Install necessary packages

In [1]:
!pip install arxiv requests pandas numpy matplotlib datetime semanticscholar 
!pip install pandas requests beautifulsoup4 fuzzywuzzy PyMuPDF



In [2]:
import arxiv
from semanticscholar import SemanticScholar
import urllib
import requests
import json
import csv
import pandas as pd
from collections import Counter, defaultdict
import numpy as np # for array manipulation
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline 
from datetime import datetime, date
import time
from bs4 import BeautifulSoup
from fuzzywuzzy import fuzz
import fitz  # PyMuPDF
import boto3
from botocore.exceptions import ClientError, NoCredentialsError
import os



Define Query & Start Date

In [16]:
# Search expression handed to the arXiv search below.
query = "corrosion AND steel alloys AND liquid lead"
# Cutoff date: the filtering cells keep only papers dated after this.
start_date = datetime(year=2021, month=6, day=1)

ARXIV Papers Harvesting

In [17]:
# Client settings: pages of 1000 entries, a 3-second pause between requests, 3 retries.
arxiv_client = arxiv.Client(page_size=1000, delay_seconds=3, num_retries=3)

# Relevance-ranked search for the configured query, with no restriction to specific IDs.
arxiv_search = arxiv.Search(
  query=query,
  id_list=[],
  sort_by=arxiv.SortCriterion.Relevance,
  sort_order=arxiv.SortOrder.Descending,
)
results_generator = arxiv_client.results(arxiv_search)

# Drain the generator so every returned paper is kept in a list for later cells.
query_results = list(results_generator)

In [None]:
# One row per paper; the columns are the attributes stored on each result object.
paper_records = list(map(vars, query_results))
qd_df = pd.DataFrame(paper_records)

In [None]:
# Build a timezone-aware (UTC) cutoff for comparison with the 'updated' column.
# `start_date` itself is deliberately left untouched: rebinding it to an aware
# timestamp would make this cell fail when re-run (tz_localize rejects aware
# values) and would change the type seen by cells that compare naive datetimes.
start_date_utc = pd.Timestamp(start_date)
if start_date_utc.tzinfo is None:
    start_date_utc = start_date_utc.tz_localize('UTC')
else:
    start_date_utc = start_date_utc.tz_convert('UTC')

if qd_df.empty:
    # No search hits: there is no 'updated' column to filter on, so fall through
    # with an empty frame and still write a header-only CSV for the merge step.
    qd_df_filtered = qd_df
    qd_df_reconfigured = pd.DataFrame(columns=['Title', 'URL'])
else:
    # Filter the dataframe to keep only papers updated after the cutoff
    qd_df_filtered = qd_df[qd_df['updated'] > start_date_utc]

    # Select only the 'title' and 'entry_id' columns and rename them
    qd_df_reconfigured = qd_df_filtered[['title', 'entry_id']].rename(columns={'title': 'Title', 'entry_id': 'URL'})

# Save the reconfigured DataFrame to a CSV file
qd_df_reconfigured.to_csv('research_papers_arxiv.csv', index=False)
print("CSV file 'research_papers_arxiv.csv' has been created successfully.")

Semantic Scholar Papers Harvesting (takes a long time; can be replaced with direct endpoint access)

In [12]:
from semanticscholar import SemanticScholar
import csv
from datetime import datetime


def _parse_publication_date(value):
    """Return ``value`` as a datetime, or None when it is missing or unparseable.

    datetime objects are returned unchanged. Strings are accepted in either
    'YYYY-MM-DD HH:MM:SS' or plain 'YYYY-MM-DD' form; anything else yields None.
    """
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        for date_format in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d'):
            try:
                return datetime.strptime(value, date_format)
            except ValueError:
                continue
    return None


# Initialize the SemanticScholar client
sch = SemanticScholar()

# Define the search query.
# NOTE: this rebinds `query` and `start_date`; cells that run after this one
# and read those names see the values assigned here.
query = 'corrosion steel alloys liquid lead'

# Define the start date for filtering
start_date = datetime(2021, 6, 1)

# Perform the search
results = sch.search_paper(query)

# Open a new CSV file for writing
with open('research_papers_semantic.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    # Write the header row
    csvwriter.writerow(['Title', 'URL', 'Publication Date'])

    # Write the data rows
    for paper in results:
        title = paper.title if paper.title else 'No title available'
        open_access_pdf = paper.openAccessPdf
        publication_date = _parse_publication_date(paper.publicationDate)

        # Only include papers with an openAccessPdf record and a publication date on or after the start date
        if isinstance(open_access_pdf, dict) and publication_date and publication_date >= start_date:
            # A null or empty 'url' is treated like a missing one.
            url = open_access_pdf.get('url') or 'No URL available'
            csvwriter.writerow([title, url, publication_date.strftime('%Y-%m-%d')])

print("CSV file 'research_papers_semantic.csv' has been created successfully.")

CSV file 'research_papers_semantic.csv' has been created successfully.


CORE Papers Harvesting

In [14]:
# adjusted CORE code to use correct url and limit date of downloads

# CORE API endpoint. The key is read from the CORE_API_KEY environment variable so
# it does not have to be stored in the notebook; the placeholder is only used when
# the variable is unset.
CORE_API_URL = "https://api.core.ac.uk/v3/search/works"
API_KEY = os.environ.get("CORE_API_KEY", "Your-API_key")

# Define the search query parameters
params = {
    "q": query,
    "limit": 50,  # Number of results to retrieve
    "apiKey": API_KEY
}

# createdDate is parsed as a naive datetime below, so compare against a naive
# cutoff even if `start_date` has been made timezone-aware elsewhere.
core_start_date = start_date.replace(tzinfo=None)

# Send a GET request to the CORE API (bounded wait so the cell cannot hang forever)
response = requests.get(CORE_API_URL, params=params, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Check for 'results' key and handle missing data
    if 'results' in data:
        # Prepare data for CSV
        papers = []
        for work in data['results']:
            title = work.get('title', 'No title available')
            download_url = work.get('downloadUrl', 'No download URL available')

            # Parse only the leading 'YYYY-MM-DDTHH:MM:SS' part, which tolerates any
            # trailing characters; records with a missing or malformed date are skipped
            # instead of aborting the whole harvest.
            try:
                created_date = datetime.strptime(str(work.get('createdDate'))[:19], '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                continue

            # Only include papers with a createdDate after the start date
            if created_date > core_start_date:
                # The link column is named 'URL' to match the arXiv and Semantic
                # Scholar CSVs, which is the column the merge step selects.
                papers.append({"Title": title, "URL": download_url})

        # Write data to CSV file
        with open('research_papers_CORE.csv', 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['Title', 'URL']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

            writer.writeheader()
            writer.writerows(papers)

        print("CSV file 'research_papers_CORE.csv' created successfully.")
    else:
        print("No 'results' key found in response.")
else:
    print(f"Error: {response.status_code}")
    print(response.text)

CSV file 'research_papers_CORE.csv' created successfully.


Creation of a Unified List, Creation of an S3 Bucket, Downloading of PDF Documents

In [15]:
print("Starting execution...")

# Step 1: Read the three CSV files
print("Reading CSV files...")
df1 = pd.read_csv('research_papers_arxiv.csv')
df2 = pd.read_csv('research_papers_semantic.csv')
df3 = pd.read_csv('research_papers_CORE.csv')
# The CORE export may label its link column 'Download URL' while the other two
# files use 'URL'. Align the name so CORE links survive the ['Title', 'URL']
# selection made after de-duplication.
if 'Download URL' in df3.columns and 'URL' not in df3.columns:
    df3 = df3.rename(columns={'Download URL': 'URL'})
print(f"Read {len(df1)} rows from arxiv, {len(df2)} from semantic, and {len(df3)} from CORE")

# Step 2: Combine them into one DataFrame
print("Combining DataFrames...")
combined_df = pd.concat([df1, df2, df3], ignore_index=True)
print(f"Combined DataFrame has {len(combined_df)} rows")

# Step 3: Remove duplicate titles
print("Removing duplicates...")
def is_duplicate(title1, title2, threshold=90):
    """Report whether two titles are near-identical, ignoring letter case.

    Both titles are lower-cased and scored with ``fuzz.ratio``; they count as
    duplicates only when that score is strictly greater than ``threshold``.
    """
    first = title1.lower()
    second = title2.lower()
    similarity = fuzz.ratio(first, second)
    return similarity > threshold

unique_titles = []
unique_rows = []


def _seen_before(candidate):
    """Return True when ``candidate`` fuzzily matches a title that was already kept."""
    for kept_title in unique_titles:
        if is_duplicate(candidate, kept_title):
            return True
    return False


for _, record in combined_df.iterrows():
    if pd.isna(record['Title']):
        # Rows without a title cannot be compared, so they are left out.
        continue
    candidate_title = str(record['Title'])
    if _seen_before(candidate_title):
        continue
    # First occurrence of this title: remember it and keep its row.
    unique_titles.append(candidate_title)
    unique_rows.append(record)

unique_df = pd.DataFrame(unique_rows, columns=['Title', 'URL'])
print(f"After removing duplicates, {len(unique_df)} rows remain")

# Step 4: Create a new CSV file with unique titles and URLs
print("Saving unique papers to CSV...")
unique_df.to_csv('unique_research_papers.csv', index=False)
print("Saved unique_research_papers.csv")

# Step 5: Create S3 bucket
def create_bucket(bucket_name, region=None):
    """Create an S3 bucket, treating a bucket the caller already owns as success.

    Args:
        bucket_name: Name of the bucket to create.
        region: Region to create the bucket in. When None, the client's default
            configuration is used and no location constraint is sent.

    Returns:
        True when the bucket was created or is already owned by the caller;
        False when the request failed or no AWS credentials were found
        (the reason is printed).
    """
    try:
        if region is None:
            s3_client = boto3.client('s3')
            s3_client.create_bucket(Bucket=bucket_name)
        elif region == 'us-east-1':
            # us-east-1 is S3's default location and is rejected when passed as
            # an explicit LocationConstraint, so the configuration is omitted.
            s3_client = boto3.client('s3', region_name=region)
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            s3_client = boto3.client('s3', region_name=region)
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        error_code = e.response.get('Error', {}).get('Code')
        if error_code == 'BucketAlreadyOwnedByYou':
            # Re-running the notebook must not fail just because the bucket
            # was created by an earlier run.
            print(f"Bucket '{bucket_name}' already exists and is owned by you.")
            return True
        print(f"Error creating bucket: {e}")
        return False
    except NoCredentialsError:
        print("No AWS credentials found. Please configure your AWS credentials.")
        return False
    return True

# Create the S3 bucket
print("Creating S3 bucket...")
bucket_name = 'transmutexresearchrepository'
if create_bucket(bucket_name):
    print(f"S3 bucket '{bucket_name}' created successfully")
else:
    print(f"Failed to create S3 bucket '{bucket_name}'. Stopping execution.")
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # itself to shut down (losing all state), whereas an exception simply stops
    # this cell and halts a "Run All".
    raise RuntimeError(f"Failed to create S3 bucket '{bucket_name}'")

# Step 6: Download PDFs and upload to S3
def download_and_upload_pdf(url, title, bucket_name, s3_client=None, timeout=30):
    """Download one document and upload it to S3 under a title-derived key.

    The function is best-effort: every failure is printed and reported through
    the return value rather than raised, so one bad link cannot stop a batch.

    Args:
        url: Address of the document. Non-string or blank values (for example
            NaN read from a CSV) are skipped without issuing a request.
        title: Paper title; only letters, digits and spaces are kept to build
            the local file name, which is also used as the S3 object key.
        bucket_name: Destination S3 bucket.
        s3_client: Optional S3 client to reuse; a new one is created when omitted.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        True when the file was downloaded and uploaded, False otherwise.
    """
    if not isinstance(url, str) or not url.strip():
        print(f"Skipping '{title}': no usable URL")
        return False

    filename = None
    try:
        print(f"Processing: {title}")

        safe_filename = "".join(
            c for c in str(title) if c.isalpha() or c.isdigit() or c == ' '
        ).rstrip()
        # Keep the name (plus '.pdf') within common 255-character file-name
        # limits, and fall back to a fixed name when nothing usable remains.
        safe_filename = safe_filename[:251].rstrip() or 'untitled'
        filename = f"{safe_filename}.pdf"

        # Stream the body to disk in chunks instead of holding it all in memory.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=65536):
                    f.write(chunk)

        if s3_client is None:
            s3_client = boto3.client('s3')
        s3_client.upload_file(filename, bucket_name, filename)

        print(f"Successfully processed: {filename}")
        return True
    except Exception as e:
        print(f"Failed to process {url}: {e}")
        return False
    finally:
        # Remove the temporary local copy on success and on failure alike.
        if filename is not None and os.path.exists(filename):
            os.remove(filename)

print("Downloading and uploading PDFs...")
# Walk the de-duplicated list, pausing one second after each paper.
for pdf_url, paper_title in zip(unique_df['URL'], unique_df['Title']):
    download_and_upload_pdf(pdf_url, paper_title, bucket_name)
    time.sleep(1)

print("PDF processing completed.")

Starting execution...
Reading CSV files...
Read 1 rows from arxiv, 45 from semantic, and 0 from CORE
Combining DataFrames...
Combined DataFrame has 46 rows
Removing duplicates...
After removing duplicates, 45 rows remain
Saving unique papers to CSV...
Saved unique_research_papers.csv
Creating S3 bucket...
S3 bucket 'transmutexresearchrepository' created successfully
Downloading and uploading PDFs...
Processing: Circumventing cracking in grading 316L stainless steel to Monel400 through compositional modifications
Successfully processed: Circumventing cracking in grading 316L stainless steel to Monel400 through compositional modifications.pdf
Processing: Interaction Between Liquid Lead and FeNi Material Using Molecular Dynamics Simulation
Successfully processed: Interaction Between Liquid Lead and FeNi Material Using Molecular Dynamics Simulation.pdf
Processing: A Review of Corrosion Behavior of Structural Steel in Liquid Lead–Bismuth Eutectic
Successfully processed: A Review of Corrosio

Processing: Resistance against Abrasive Wear and Corrosion of Laser Powder Bed Alloyed High Chromium Tool Steels
Failed to process https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/srin.202200455: 403 Client Error: Forbidden for url: https://onlinelibrary.wiley.com/doi/pdfdirect/10.1002/srin.202200455
Processing: Metal ions release from metallic orthopedic implants exposed to tribocorrosion and electrochemical corrosion conditions in simulated body fluids: Clinical context and in vitro experimental investigations
Failed to process https://wjarr.com/sites/default/files/WJARR-2022-0438.pdf: 403 Client Error: Forbidden for url: https://wjarr.com/sites/default/files/WJARR-2022-0438.pdf
Processing: Natural Deep Eutectic Solvents Based on Choline Chloride and Phenolic Compounds as Efficient Bioadhesives and Corrosion Protectors
Failed to process https://pubs.acs.org/doi/pdf/10.1021/acssuschemeng.2c01976: 403 Client Error: Forbidden for url: https://pubs.acs.org/doi/pdf/10.1021/acssuschem