In [5]:
import os 
import pandas as pd
from unpywall import Unpywall
from unpywall.utils import UnpywallCredentials
import requests

In [6]:
combined_df = pd.read_csv("./COMBINED.csv")

In [7]:
# Create a folder for downloads
os.makedirs("./downloads", exist_ok=True)
UnpywallCredentials("sv22900@uga.edu")
# export UNPAYWALL_EMAIL=your_email@example.com

# Initialize Unpywall
unpywall = Unpywall()

In [None]:
combined_df[combined_df["DOI"].isna()]

Unnamed: 0                                                          1
Title               AGE AND GENDER BASED WORKLOAD CONSTRAINT FOR A...
Abstract            This paper investigates Assembly Line Worker A...
Authors                                  Efe, B; Kremer, GEO; Kurt, M
Keywords            {'physical workload', 'disabled workers', 'ass...
DOI                                                               NaN
ISSN                                                        1943-670X
Publication Year                                                 2018
Name: 1, dtype: object

In [None]:

result = unpywall.doi(dois=[doi])
display(result)

In [36]:
unpywall.download_pdf_file(doi,"save.pdf",progress=True)

In [29]:
# Function to download PDF
def download_pdf(doi, title):
    try:
        # Validate DOI and Title
        if pd.isna(doi) or pd.isna(title):
            print(f"Skipping: Invalid DOI or Title. DOI: {doi}, Title: {title}")
            return

        # Retrieve metadata from Unpaywall
        result = unpywall.doi(dois=[doi])
        if not result.empty:
            # Access the best open-access location URL
            oa_location = result.iloc[0]["best_oa_location.url"]
            if oa_location:
                # Check if URL points to a PDF
                response = requests.head(oa_location, allow_redirects=True)
                content_type = response.headers.get("Content-Type", "")
                if "application/pdf" not in content_type:
                    print(f"URL is not a PDF: {oa_location}")
                    return

                # Sanitize title for filename
                filename = f"{title.replace(' ', '_').replace('/', '_')}.pdf"
                filepath = os.path.join("./downloads", filename)

                # Download the PDF
                response = requests.get(oa_location, stream=True)
                if response.status_code == 200:
                    with open(filepath, "wb") as pdf_file:
                        for chunk in response.iter_content(chunk_size=1024):
                            if chunk:
                                pdf_file.write(chunk)
                    print(f"Downloaded: {title}")
                else:
                    print(f"Failed to download PDF for DOI: {doi} (HTTP {response.status_code})")
            else:
                print(f"No PDF URL found for DOI: {doi}")
        else:
            print(f"No data found for DOI: {doi}")
    except Exception as e:
        print(f"Error processing DOI {doi}: {e}")


# Iterate through DOIs in the DataFrame
for index, row in combined_df[:5].iterrows():
    doi = row.get("DOI", None)
    title = row.get("Title", None)
    download_pdf(doi, title)

Downloaded: Human-Machine Interaction: Adapted Safety Assistance in Mentality Using Hidden Markov Chain and Petri Net
Skipping: Invalid DOI or Title. DOI: nan, Title: AGE AND GENDER BASED WORKLOAD CONSTRAINT FOR ASSEMBLY LINE WORKER ASSIGNMENT AND BALANCING PROBLEM IN A TEXTILE FIRM
URL is not a PDF: https://www.tandfonline.com/doi/pdf/10.1080/21693277.2022.2090458?needAccess=true
Error processing DOI 10.1109/IC_ASET61847.2024.10596228: 'best_oa_location.url'
Error processing DOI 10.1109/THMS.2019.2903402: 'best_oa_location.url'


In [18]:
Unpywall.get_all_links(doi=combined_df.loc[0]["DOI"])

['https://www.mdpi.com/2076-3417/9/23/5066/pdf?version=1575369748']

In [None]:
def get_open_access_pdf(doi):
    try:
        response = unpaywall.get(doi)
        oa_link = response.get("best_oa_location", {}).get("url_for_pdf", None)
        return oa_link
    except Exception as e:
        return None


# Example usage
df["PDF_Link"] = df["DOI"].apply(get_open_access_pdf)