This Jupyter notebook combines several ways to automatically download PDFs based on metadata saved in a CSV file. The results of each download run are saved in separate folders and then combined at the end. File naming is standardized so that duplicate files can be detected and deleted.
Please note that file and folder names must be adjusted to match the user's local setup.

# run IEEE papers through IEEE code

Not required: IEEE papers are also covered by the API-key code later in this notebook.

In [None]:
import pandas as pd
import numpy as np
import openai
import json
from tqdm import tqdm
import ast
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
import math
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import re
import os

In [None]:
# Read the CSV file and save it as a .pkl file
# Both paths are relative to the notebook's working directory.
csv_path = "PID_IEEE.csv"
pkl_path = "PID_IEEE.pkl"
df_pid_ieee = pd.read_csv(csv_path)
df_pid_ieee.to_pickle(pkl_path)
# `df` is the name the IEEE download cells read from; it is an alias of the
# same DataFrame, not a copy.
df = df_pid_ieee
# Last expression of the cell: displays the column names in the notebook.
df.columns

In [None]:
def download_ieee_pdf(paper_id, download_path, paper_name):
    """Download one IEEE Xplore PDF and save it as ``<download_path><paper_name>.pdf``.

    Args:
        paper_id: IEEE document number used as the ``arnumber`` query value.
            ``None`` or an empty value means the id lookup failed; nothing is
            requested in that case.
        download_path: Destination prefix. It is concatenated directly with
            the file name, so it must end with a path separator and the
            directory must already exist.
        paper_name: File name without the ``.pdf`` extension.

    Returns:
        None. The outcome is reported with ``print``.
    """
    # Without an id the URL would literally ask for "arnumber=None".
    if paper_id is None or str(paper_id).strip() == "":
        print("Failed to download the PDF. No IEEE paper id was provided.")
        return

    pdf_url = 'http://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&isnumber=&arnumber={}'.format(paper_id)

    # Define headers to mimic a browser request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Send a request to download the PDF. The timeout keeps one stalled
    # connection from hanging the whole batch, and a network error is reported
    # for this paper instead of aborting the loop that called us.
    try:
        response = requests.get(pdf_url, headers=headers, timeout=60)
    except requests.exceptions.RequestException as e:
        print("Failed to download the PDF. Error:", e)
        return

    # Check if the request was successful.
    # NOTE: a 200 response is saved as-is even if the body is not a valid PDF;
    # such files are detected afterwards by get_unreadable_pdf_numbers().
    if response.status_code == 200:
        with open(f"{download_path}{paper_name}.pdf", "wb") as file:
            file.write(response.content)
        print(f"PDF downloaded successfully as {paper_name}")
    else:
        print("Failed to download the PDF. Status code:", response.status_code)

In [None]:
def get_ieee_paper_id(url):
    """Open *url* in headless Chrome and return the IEEE document number.

    The page is loaded, given two seconds to settle, and the digits that
    follow ``document/`` in the browser's resulting URL are returned as a
    string. ``None`` is returned when the URL contains no such segment.
    The browser is shut down before returning, whether or not the lookup
    succeeded.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")  # no visible browser window
    browser = webdriver.Chrome(options=chrome_options)

    try:
        browser.get(url)
        time.sleep(2)  # give redirects time to finish

        # The address the browser ended up on carries the document number.
        found = re.search(r'document/(\d+)', browser.current_url)
        if not found:
            return None

        paper_id = found.group(1)
        print("Captured paper id:", paper_id)
        return paper_id
    finally:
        browser.quit()

In [None]:
# Ensure the directory the PDFs are actually written to exists
# (download_ieee_pdf does not create it).
download_path = "./dl_ieee/"
os.makedirs(download_path, exist_ok=True)

# Download the first 77 IEEE papers, or fewer if the index list is shorter.
# NOTE(review): `inx_ieee` is not defined in any visible cell; it is assumed to
# be an indexable collection of row labels of `df` for the IEEE papers.
# NOTE(review): files are named "<Type>-<Year>-<EID>" here, while
# get_unreadable_pdf_numbers() expects names that start with "<number>-";
# confirm which naming scheme the retry cell should rely on.
for n in tqdm(range(min(77, len(inx_ieee)))):
    inxid = int(inx_ieee[n])
    doi = df['DOI'][inxid]
    eid = str(df['EID'][inxid]).replace('.', '')  # Remove dot from EID
    pdf_name = f"{df['Type'][inxid]}-{df['Year'][inxid]}-{eid}"
    url = f"https://doi.org/{doi}"
    ieee_paper_id = get_ieee_paper_id(url)
    if ieee_paper_id is None:
        # The DOI did not resolve to an IEEE "document/<id>" page.
        print(f"No IEEE paper id found for DOI {doi}; skipping.")
        continue
    download_ieee_pdf(ieee_paper_id, download_path, pdf_name)

In [None]:
# re-run downloads for invalid files
def get_unreadable_pdf_numbers(folder_path):
    """Return the leading file-name numbers of PDFs that cannot be opened.

    Walks *folder_path* recursively. Every file ending in ``.pdf`` is opened
    with ``PdfReader`` and its page count is read. If that fails, the integer
    before the first hyphen of the file name is recorded; files whose name
    does not start with ``<digits>-`` are ignored.

    Args:
        folder_path: Folder to scan.

    Returns:
        list[int]: Extracted numbers, in directory-walk order.

    Raises:
        NameError: If ``PdfReader`` has not been imported. A bare ``except``
            would hide this and report every PDF as unreadable.
    """
    unreadable_numbers = []

    # Traverse the folder for PDF files
    for root, _, files in os.walk(folder_path):
        for file in files:
            if not file.endswith(".pdf"):
                continue
            file_path = os.path.join(root, file)
            try:
                # NOTE(review): PdfReader is not imported in any visible cell;
                # presumably it comes from pypdf / PyPDF2 - confirm and import it.
                reader = PdfReader(file_path)
                _ = len(reader.pages)  # Force reading to check if it works
            except NameError:
                # A missing import is a setup error, not an unreadable file.
                raise
            except Exception:
                # Extract the first number before the first hyphen
                match = re.match(r'(\d+)-', file)
                if match:
                    unreadable_numbers.append(int(match.group(1)))

    return unreadable_numbers

In [None]:
# Scan the IEEE download folder and collect the leading file-name numbers of
# every PDF that could not be opened.
folder_path = './dl_ieee'
unreadable_pdf_list = get_unreadable_pdf_numbers(folder_path)
# Last expression of the cell: how many downloads need to be retried.
len(unreadable_pdf_list)

In [None]:
# Retry one slice (entries 20-49) of the unreadable PDFs; adjust the slice to
# work through the list in batches.
download_path = "./dl_ieee/"
for n in tqdm(unreadable_pdf_list[20:50]):
    # `n` is the number recovered from the start of the file name and is used
    # as a position in `inx_ieee`; look the row label up once.
    inxid = inx_ieee[n]
    doi = df['DOI'][inxid]
    title = re.sub(r"[\/\-?]", " ", df['Title'][inxid])
    # Keep the "<n>-<row>-<year>-<type code>-<title>" layout so that a file
    # that is still unreadable can be found again by its leading number.
    pdf_name = f"{n}-{inxid}-{df['Year'][inxid]}-{df['Type Code'][inxid]}-{title}"
    url = f"https://doi.org/{doi}"
    ieee_paper_id = get_ieee_paper_id(url)
    if ieee_paper_id is None:
        # The DOI did not resolve to an IEEE "document/<id>" page.
        print(f"No IEEE paper id found for DOI {doi}; skipping.")
        continue
    download_ieee_pdf(ieee_paper_id, download_path, pdf_name)

# run Elsevier downloads

Not required: Elsevier papers are also covered by the API-key code later in this notebook.

In [None]:
import pandas as pd
import numpy as np
import openai
import json
from tqdm import tqdm
import ast
import plotly.express as px
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
import math
import requests
from bs4 import BeautifulSoup
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import time
import os
import webbrowser
import requests
from bs4 import BeautifulSoup
import requests
import pyautogui
import os

In [None]:
# Refresh the pickle cache from the Elsevier CSV when the CSV is present,
# then load the working DataFrame from that pickle.
import os

csv_path = "/Users/Rayna/Downloads/HackingMat/PID_elsevier.csv"
if not os.path.exists(csv_path):
    print(f"File '{csv_path}' not found. Please check the file path.")
else:
    df_elsevier = pd.read_csv(csv_path)
    df_elsevier.to_pickle("PID_elsevier.pkl")
    print("CSV loaded and pickle file saved.")

# Runs in both cases, so a pickle left by an earlier run is used when the CSV
# is missing.
df = pd.read_pickle('PID_elsevier.pkl')

In [None]:
# If you only want to filter by Publisher in a list of Elsevier names
inx_elsevier = df.index[df['Publisher'].isin(['Elsevier', 'Elsevier B.V.', 'Elsevier Ltd', 'Elsevier GmbH'])]

# `inx_elsevier` holds index *labels*, so select with the label-based .loc.
# This matches the `df.at[inxid, ...]` lookups in the download cell and gives
# the same rows as .iloc only while the index is the default 0..n-1 range.
df.loc[inx_elsevier]

In [None]:
# Used to find out which papers have already been downloaded.

def extract_second_number_from_pdfs(folder_path):
    """
    Collect the second number from the names of the PDF files in a folder.

    Only files directly inside the folder are considered (no recursion). The
    ``.pdf`` extension is matched case-insensitively, and a file contributes a
    number only if its name looks like ``<digits>-<digits>-<digits>-<rest>``.

    Args:
        folder_path (str): The folder that holds the PDF files.

    Returns:
        list: The extracted second numbers as integers in ascending order.
            Empty when nothing matches, the folder does not exist, or any
            other error occurs (a message is printed in the error cases).
    """
    name_pattern = re.compile(r'^\d+-(\d+)-\d+-.+')
    try:
        hits = (
            name_pattern.match(entry)
            for entry in os.listdir(folder_path)
            if entry.lower().endswith(".pdf")
        )
        return sorted(int(hit.group(1)) for hit in hits if hit)
    except FileNotFoundError:
        print(f"Error: Folder '{folder_path}' not found.")
        return []
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [None]:
# Alias of the Elsevier row labels selected above.
inx_download = inx_elsevier
category_path = './dl_elsevier/'
# Row numbers parsed from the PDFs already present in `category_path`; the
# download loop skips rows whose label appears in this list.
# NOTE(review): the download cell saves into "<base_dir>/dl_elsevier_EID" with
# names like "<Type>_<Year>_<EID>.pdf", which the "<n>-<index>-<year>-..."
# pattern used by extract_second_number_from_pdfs will not match - confirm
# this folder and list are still the right ones to check.
existing_paper_ids = extract_second_number_from_pdfs(category_path)

In [None]:
import os

# Project folder under the current user's home directory; `base_dir` is
# reused by the Elsevier download cell to build its own output folder.
base_dir = os.path.expanduser("~/Downloads/HackingMat")
folder_path = os.path.join(base_dir, "dl_elsevier_PID")
# exist_ok=True makes re-running the cell harmless.
os.makedirs(folder_path, exist_ok=True)
print(f"Folder created at: {folder_path}")

In [None]:
import os

# Read the Elsevier API key from the environment instead of committing a real
# credential to the notebook. The fallback is the same placeholder used by the
# API-key cell further down. Any key that was previously hard-coded here
# should be treated as leaked and revoked.
api = os.environ.get("ELSEVIER_API_KEY", "your api key")  # replace with your API key

In [None]:
import re
import requests
from tqdm import tqdm

# Make sure this folder exists
download_folder = os.path.join(base_dir, "dl_elsevier_EID")
os.makedirs(download_folder, exist_ok=True)

# Build the lookup once: set membership is O(1) per row, whereas the list
# would be scanned for every row.
already_downloaded = set(existing_paper_ids)

# Download one PDF per Elsevier row through the Elsevier article API.
for inxid in tqdm(inx_elsevier):
    if inxid in already_downloaded:
        continue

    doi = df.at[inxid, 'DOI']
    if not isinstance(doi, str):
        continue  # missing DOI (e.g. NaN): nothing to request

    # Extract metadata
    paper_type = df.at[inxid, 'Type']
    year = df.at[inxid, 'Year']
    eid = str(df.at[inxid, 'EID']).replace('.', '')  # Remove dot from EID

    # Construct clean filename
    filename = f"{paper_type}_{year}_{eid}.pdf"
    filename_path = os.path.join(download_folder, filename)

    # Build the download URL
    url = f"https://api.elsevier.com/content/article/doi/{doi}?apiKey={api}&httpAccept=application%2Fpdf"

    try:
        # stream=True defers the body until the headers have been checked.
        # The `with` block closes the response, so the connection is released
        # even on the "Skipped" branch where the body is never read. The
        # timeout keeps one stalled request from hanging the whole run.
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()

            # Accept "application/pdf" with or without trailing parameters.
            content_type = response.headers.get('Content-Type') or ''
            is_pdf = content_type.startswith('application/pdf')

            if response.headers.get('x-els-status') == 'OK' and is_pdf:
                with open(filename_path, 'wb') as f:
                    f.write(response.content)
                print(f"PDF saved as: {filename}")
            else:
                print(f"Skipped: {filename} - Not a PDF or status not OK")

    except requests.exceptions.RequestException as e:
        print(f"Error downloading PDF for DOI {doi}: {e}")

# use fulltext_article_downloader for other publishers with APIs

In [None]:
# Publisher credentials, exported as environment variables of this process.
# Replace every placeholder with your own key / e-mail address.
import os

os.environ.update({
    "ELSEVIER_API_KEY": "your api key",
    "SPRINGER_API_KEY": "your api key",
    "WILEY_API_KEY": "your api key",
    "IEEE_API_KEY": "your api key",
    "UNPAYWALL_EMAIL": "your email address",
})

In [None]:
import pandas as pd
from fulltext_article_downloader import bulk_download_articles  # assuming this is imported correctly
from tqdm import tqdm

# set file paths, load CSV and study type
# NOTE: this cell uses `os` without importing it; it relies on an earlier
# cell (e.g. the API-key cell) having been run first.
csv_path = "path to CSV with metadata"  # Update to your file
output_base = "path to output folder"  # Main folder for downloaded PDFs
# The CSV file name without its extension labels the output folder, the log
# file and the failed-DOI report.
study_type = os.path.splitext(os.path.basename(csv_path))[0]  # e.g., "PID"
df = pd.read_csv(csv_path)
# Rows without a DOI are left out of the download list.
doi_list = df['DOI'].dropna().tolist()

naming format for files

In [None]:
# filename mapping
# Maps each DOI to the file-name stem built by get_metadata_for_doi(); it is
# filled by the loop at the end of this cell and read by the renaming step of
# the download cell.
doi_to_filename = {}

def get_metadata_for_doi(row):
    """Build the ``<Type>_<Year>_<EID>`` file-name stem for one metadata row.

    *row* only needs a ``get(key, default)`` method (a pandas Series or a
    dict). Missing fields become ``'Unknown'``, every value is converted with
    ``str``, and dots are stripped from the EID.
    """
    paper_type = str(row.get('Type', 'Unknown'))
    year = str(row.get('Year', 'Unknown'))
    eid_without_dots = str(row.get('EID', 'Unknown')).replace('.', '')
    return "_".join([paper_type, year, eid_without_dots])

# Refuse to continue with an incomplete metadata file.
required_columns = {'DOI', 'Year', 'Publisher', 'Title', 'EID', 'Type'}
if not required_columns.issubset(df.columns):
    raise ValueError("CSV is missing one or more required columns: DOI, Year, Title, Publisher, EID, Type")

# Register a target file-name stem for every row that has a DOI.
for _, row in df.iterrows():
    doi = row['DOI']
    if pd.isna(doi):
        continue
    doi_to_filename[doi] = get_metadata_for_doi(row)


In [None]:
# set download file
output_dir = os.path.join(output_base, f"{study_type}_papers")
os.makedirs(output_dir, exist_ok=True)
log_file = os.path.join(output_dir, f"{study_type}_download.log")


def _count_pdfs(folder):
    """Return how many ``.pdf`` files (any letter case) exist anywhere under *folder*."""
    return sum(
        name.lower().endswith('.pdf')
        for _root, _dirs, names in os.walk(folder)
        for name in names
    )


def _clean_pdf_name(raw_name):
    """Return *raw_name* limited to letters, digits and ``._-``, spaces turned into ``_``, plus ``.pdf``."""
    kept = "".join(c for c in raw_name if c.isalnum() or c in " ._-")
    return kept.strip().replace(" ", "_") + ".pdf"


# count PDFs before download
pdf_count_before = _count_pdfs(output_dir)

print(f"\nDownloading papers for {study_type} ({len(doi_list)} DOIs)...")
results = bulk_download_articles(
    doi_list,
    output_dir=output_dir,
    log_file=log_file,
    sleep=0.2,
)

# correctly name files
# NOTE(review): this assumes the downloader names each file after its DOI with
# "/" replaced by "_" - confirm against fulltext_article_downloader.
for filename in os.listdir(output_dir):
    if not filename.endswith(".pdf"):
        continue
    original_path = os.path.join(output_dir, filename)

    # Try to extract the DOI
    doi_guess = filename.replace(".pdf", "").replace("_", "/")
    matched_doi = None

    if doi_guess in doi_to_filename:
        matched_doi = doi_guess
    else:
        # DOIs that contain "_" themselves do not survive the guess above, so
        # fall back to looking for each known DOI inside the file name.
        for doi in doi_to_filename:
            if doi.replace("/", "_") in filename or doi in filename:
                matched_doi = doi
                break

    if not matched_doi:
        print(f"Could not match filename to DOI: {filename}")
        continue

    clean_name = _clean_pdf_name(doi_to_filename[matched_doi])
    new_path = os.path.join(output_dir, clean_name)

    try:
        os.rename(original_path, new_path)
        print(f"Renamed: {filename} → {clean_name}")
    except Exception as e:
        print(f"Rename failed for {filename}: {e}")

# log failed DOIs
# The files were renamed above, so a successful download may now exist under
# either its DOI-based name or its final metadata-based name. Checking only
# the DOI-based name would report every renamed file as a failure.
failed_dois = []
if results:
    for doi, status in results.items():
        candidate_names = [f"{doi.replace('/', '_')}.pdf"]
        if doi in doi_to_filename:
            candidate_names.append(_clean_pdf_name(doi_to_filename[doi]))
        downloaded = any(
            os.path.exists(os.path.join(output_dir, name))
            for name in candidate_names
        )
        if not status or not downloaded:
            failed_row = df[df['DOI'] == doi]
            if not failed_row.empty:
                failed_dois.append(failed_row)

if failed_dois:
    failed_df = pd.concat(failed_dois, ignore_index=True)
    failed_csv_path = os.path.join(output_base, f"failed_{study_type}.csv")
    failed_df.to_csv(failed_csv_path, index=False)
    print(f"Saved {len(failed_df)} failed DOIs to: {failed_csv_path}")

# count PDFs
pdf_count_after = _count_pdfs(output_dir)

print(f"\nTotal PDF files in '{output_dir}': {pdf_count_after}")
print(f"New PDFs downloaded: {pdf_count_after - pdf_count_before}")

# search with Selenium as a final backup

In [None]:
import os
import time
import shutil
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options

In [None]:
# set paths to CSV & download directory
# repeat by changing CSV to read (i.e. failed_glass) - use CSVs of DOIs that failed to download through API code
csv_path = 'path to your csv file with metadata'
download_dir = 'path to desired output folder'
os.makedirs(download_dir, exist_ok=True)

# load CSV
df = pd.read_csv(csv_path)
# Both columns are needed: the DOI builds the URL and the EID builds the file
# name. Filtering on EID alone let rows with a missing DOI through, which then
# requested "https://doi.org/nan".
df = df[df['EID'].notna() & df['DOI'].notna()]

# track failed download DOIs
failed_dois = []

# set chrome options & start browser
chrome_options = Options()
chrome_options.add_experimental_option("prefs", {
    "download.default_directory": download_dir,
    "download.prompt_for_download": False,
    "plugins.always_open_pdf_externally": True
})
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--headless=new")

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# download PDFs based on DOI
try:
    for _, row in df.iterrows():
        doi = row['DOI']
        year = str(row.get('Year', 'Unknown'))
        eid = str(row.get('EID', 'Unknown')).replace('.', '')
        paper_type = str(row.get('Type', 'Unknown'))
        url = f"https://doi.org/{doi}"

        print(f"Opening: {url}")

        # Record existing files to detect new ones after download
        before_files = set(os.listdir(download_dir))

        try:
            driver.get(url)
            time.sleep(5)  # Let redirect happen

            # Find a link with 'pdf' in href
            pdf_button = driver.find_element(By.XPATH, "//a[contains(@href, 'pdf')]")
            pdf_link = pdf_button.get_attribute("href")

            if not pdf_link:
                print(f"No PDF link found for {doi}")
                failed_dois.append(row)
                continue

            driver.get(pdf_link)
            print(f"Downloading PDF for: {doi}")
            time.sleep(10)  # Adjust based on your connection

            # Detect new file
            new_files = set(os.listdir(download_dir)) - before_files
            pdf_file = next((f for f in new_files if f.endswith('.pdf')), None)

            if pdf_file:
                new_name = f"{paper_type}_{year}_{eid}.pdf"
                clean_name = "".join(c for c in new_name if c.isalnum() or c in " ._-").strip().replace(" ", "_")
                new_path = os.path.join(download_dir, clean_name)
                original_path = os.path.join(download_dir, pdf_file)
                shutil.move(original_path, new_path)
                print(f"Renamed to: {clean_name}")
            else:
                print(f"No new PDF detected for {doi}")
                failed_dois.append(row)

        except Exception as e:
            # A failure for one paper must not stop the run; record it so it
            # ends up in the failed-DOI CSV.
            print(f"Error downloading {doi}: {e}")
            failed_dois.append(row)
finally:
    # browser cleanup - also runs when the loop is interrupted, so no headless
    # Chrome process is left behind.
    driver.quit()

# write failed download DOIs to a new CSV
if failed_dois:
    failed_df = pd.DataFrame(failed_dois)
    failed_csv_path = os.path.join(os.path.dirname(csv_path), "selenium_failed_altypes.csv")
    failed_df.to_csv(failed_csv_path, index=False)
    print(f"Saved {len(failed_df)} failed DOIs to: {failed_csv_path}")
else:
    print("All PDFs downloaded successfully!")

print("Done downloading PDFs.")

# count and combine PDFs, delete duplicates

In [None]:
# Adjust base_dir and the subfolder names below to your local directory layout.
import os

base_dir = "folder with all PDFs"
# subfolders (if applicable)
dirs = [
    "auto_downloads/PID_papers",
    "PID Papers by EID/dl_elsevier_EID",
    "PID Papers by EID/dl_ieee_EID"
]

# Count the PDFs (any letter case in the extension) found anywhere below each
# subfolder; a folder that does not exist simply counts as 0.
pdf_counts = {}
for d in dirs:
    total = 0
    for _root, _subdirs, names in os.walk(os.path.join(base_dir, d)):
        total += sum(1 for name in names if name.lower().endswith('.pdf'))
    pdf_counts[d] = total

for d in dirs:
    print(f"{d}: {pdf_counts[d]} PDF files")

In [None]:
import shutil

# Define source and destination directories
# Placeholders: list the folders counted in the previous cell as sources and
# set the folder that should receive the combined PDFs.
src_dirs = [
    "directories above"
]
dst_dir = "output directory for all PDFs, accounting for no duplicates"
# exist_ok=True makes re-running the cell harmless.
os.makedirs(dst_dir, exist_ok=True)

# Helper that picks a file name not yet present in the destination folder
def get_unique_filename(dst_dir, filename):
    """Return *filename*, or a numbered variant of it, that is free in *dst_dir*.

    When ``<dst_dir>/<filename>`` does not exist the name is returned
    unchanged. Otherwise ``name(1).ext``, ``name(2).ext``, ... are tried in
    order and the first one that does not exist is returned.
    """
    if not os.path.exists(os.path.join(dst_dir, filename)):
        return filename

    stem, suffix = os.path.splitext(filename)
    attempt = 1
    while True:
        candidate = f"{stem}({attempt}){suffix}"
        if not os.path.exists(os.path.join(dst_dir, candidate)):
            return candidate
        attempt += 1

# Copy every PDF from the source folders into dst_dir. A name that is already
# taken there gets a "(n)" suffix, so same-named files are kept side by side
# rather than overwritten.
# NOTE(review): the notebook introduction speaks of deleting duplicates, but
# this step keeps them under numbered names - confirm which is intended.
for src in src_dirs:
    for fname in os.listdir(src):
        if not fname.lower().endswith('.pdf'):
            continue
        target_name = get_unique_filename(dst_dir, fname)
        shutil.copy2(os.path.join(src, fname), os.path.join(dst_dir, target_name))

# Count PDFs in the final folder (top level only)
pdf_count = len([f for f in os.listdir(dst_dir) if f.lower().endswith('.pdf')])
print(f"Total PDF files in '{dst_dir}': {pdf_count}")