In [1]:
# Imports
import requests  
from bs4 import BeautifulSoup  
import pandas as pd  
import io  
from PyPDF2 import PdfReader  
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
# https://huggingface.co/com3dian/Bart-large-paper2slides-summarizer/blob/main/README.md?code=true#L22

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/stephenreagin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/stephenreagin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
#!pip3 install torch torchvision

In [3]:
def fetch_arxiv_data(search_query, max_results=10, start_date=None, end_date=None, primary_category=None, categories=None):
    """
    Fetches data from the arXiv API based on specified parameters.

    Parameters:
        search_query (str): The search query string.
        max_results (int): The maximum number of results to retrieve (default is 10).
        start_date (str): The start date for the search query in the format 'YYYY-MM-DD'.
        end_date (str): The end date for the search query in the format 'YYYY-MM-DD'.
        primary_category (str): The primary category of the articles.
        categories (str): Additional categories for the articles.

    Returns:
        DataFrame: A pandas DataFrame containing the fetched arXiv data.
    """
    api_url = "http://export.arxiv.org/api/query"  # Defining the API URL for arXiv
    params = {
        "search_query": search_query,  # Setting search query parameter
        "start": 0,  # Setting start parameter for pagination
        "max_results": max_results,  # Setting maximum results parameter
    }
    if start_date:
        params["start_date"] = start_date  # Adding start date parameter if provided
    if end_date:
        params["end_date"] = end_date  # Adding end date parameter if provided
    if primary_category:
        params["cat"] = primary_category  # Adding primary category parameter if provided
    if categories:
        params["categories"] = categories  # Adding additional categories parameter if provided

    response = requests.get(api_url, params=params)  # Making a GET request to arXiv API with specified parameters
    if response.status_code == 200:  # Checking if the request was successful
        feed = BeautifulSoup(response.content, features="html.parser")  # Parsing the response content using BeautifulSoup
        entries = feed.find_all('entry')  # Finding all 'entry' elements in the parsed content
        articles = []

        for entry in entries:  # Looping through each entry
            article = {}  # Creating an empty dictionary to store article data
            article['Title'] = entry.title.text  # Extracting title of the article
            article['Authors'] = [author.find('name').text for author in entry.find_all('author')]  # Extracting authors of the article
            article['Published'] = entry.published.text  # Extracting publication date
            article['Updated'] = entry.updated.text  # Extracting last updated date
            article['Summary'] = entry.summary.text.strip()  # Extracting summary of the article
            article['ID'] = entry.id.text  # Extracting unique ID of the article
            articles.append(article)  # Appending article data to the list

        df = pd.DataFrame(articles)  # Creating a pandas DataFrame from the list of articles
        return df  # Returning the DataFrame
    else:
        print("Failed to retrieve data from arXiv API")  # Printing error message if request fails
        return None

def download_pdf_from_link(link):
    """
    Downloads a PDF file from the given URL.

    Parameters:
        link (str): The URL of the PDF file.

    Returns:
        io.BytesIO or None: BytesIO object containing the PDF content if successful, else None.
    """
    response = requests.get(link, stream=True)  # Making a GET request to download the PDF file
    if response.status_code == 200:  # Checking if the request was successful
        return io.BytesIO(response.content)  # Returning BytesIO object containing the PDF content
    else:
        print(f"Failed to download PDF from {link}")  # Printing error message if download fails
        return None

def extract_text_from_pdf(pdf_io):
    """
    Extracts text from a PDF file.

    Parameters:
        pdf_io (io.BytesIO): BytesIO object containing the PDF content.

    Returns:
        str: Extracted text from the PDF.
    """
    if pdf_io:
        try:
            reader = PdfReader(pdf_io)  # Creating a PdfReader object with the PDF content
            text = ""
            for page in reader.pages:  # Looping through each page in the PDF
                text += page.extract_text() + "\n"  # Extracting text from the page and appending it to the 'text' variable
            return text  # Returning the extracted text
        except Exception as e:
            print(f"Error occurred while extracting text from PDF: {str(e)}")  # Printing error message if extraction fails
            return ""

def process_pdf_links(df):
    """
    Processes PDF links in a DataFrame by downloading and extracting text from them.

    Parameters:
        df (DataFrame): The pandas DataFrame containing PDF links.

    Returns:
        DataFrame: The DataFrame with an additional column containing extracted text from PDFs.
    """
    df['PDF_Text'] = ''  # Adding an empty column 'PDF_Text' to the DataFrame
    for index, row in df.iterrows():  # Looping through each row in the DataFrame
        link = row['ID']  # Extracting the PDF link from the 'ID' column
        # We need to convert the '/abs/' URL to a '/pdf/' URL and ensure it ends with '.pdf'
        link = link.replace('abs', 'pdf')  # Replacing 'abs' with 'pdf' in the URL
        if not link.endswith('.pdf'):  # Checking if the link ends with '.pdf'
            link += '.pdf'  # Appending '.pdf' to the link if it doesn't end with it already
        print(f"Processing link: {link}")  # Printing the processed link
        pdf_io = download_pdf_from_link(link)  # Downloading the PDF file
        if pdf_io:  # Checking if PDF download was successful
            text = extract_text_from_pdf(pdf_io)  # Extracting text from the PDF
            df.at[index, 'PDF_Text'] = text  # Adding the extracted text to the 'PDF_Text' column in the DataFrame
    return df  # Returning the updated DataFrame

def preprocess_text(text):
    """
    Preprocesses the text by converting it to lowercase, removing digits, and removing punctuation.

    Parameters:
        text (str): The input text to be preprocessed.

    Returns:
        str: The preprocessed text.
    """
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

def remove_stopwords(tokens):
    """
    Removes stopwords from the list of tokens.

    Parameters:
        tokens (list): List of tokens.

    Returns:
        list: Tokens with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words] 
    return filtered_tokens 

def perform_lemmatization(tokens):
    """
    Performs lemmatization on the list of tokens.

    Parameters:
        tokens (list): List of tokens.

    Returns:
        list: Lemmatized tokens.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def clean_text(text):
    """
    Cleans the text by preprocessing, removing stopwords, and performing lemmatization.

    Parameters:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    # Preprocess the text
    text = preprocess_text(text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    filtered_tokens = remove_stopwords(tokens)
    
    # Perform lemmatization
    lemmatized_tokens = perform_lemmatization(filtered_tokens)
    
    # Join the lemmatized tokens into a string
    cleaned_text = ' '.join(lemmatized_tokens)
    
    return cleaned_text
    

In [58]:
# Example usage
df = fetch_arxiv_data(search_query="gravity", max_results=5, start_date="2022-12-01", end_date="2022-12-31", primary_category="quant-ph", categories=["quant-ph", "cond-mat"])
df_with_pdf_text = process_pdf_links(df)
df_with_pdf_text

Processing link: http://arxiv.org/pdf/hep-th/0703034v2.pdf
Processing link: http://arxiv.org/pdf/2209.15047v2.pdf
Processing link: http://arxiv.org/pdf/hep-th/9701161v1.pdf
Processing link: http://arxiv.org/pdf/1212.5572v1.pdf
Processing link: http://arxiv.org/pdf/2003.03198v1.pdf


Unnamed: 0,Title,Authors,Published,Updated,Summary,ID,PDF_Text
0,D=4 Einstein gravity from higher D CS and BI g...,[Horatiu Nastase],2007-03-05T08:56:06Z,2007-03-30T08:02:24Z,An alternative to usual dimensional reduction ...,http://arxiv.org/abs/hep-th/0703034v2,arXiv:hep-th/0703034v2 30 Mar 2007hep-th/0703...
1,How to quantize gravity and how not to quantiz...,[Philip D. Mannheim],2022-09-29T18:43:49Z,2023-01-16T02:34:47Z,Taking the quantization of electromagnetism as...,http://arxiv.org/abs/2209.15047v2,Springer Nature 2021 L ATEX template\nHow to q...
2,KPZ Formulas for Weyl-Invariant Induced Gravit...,[G. Amelino-Camelia],1997-01-28T15:32:11Z,1997-01-28T15:32:11Z,I discuss the applicability in Weyl-invariant ...,http://arxiv.org/abs/hep-th/9701161v1,arXiv:hep-th/9701161v1 28 Jan 19971\nKPZ Form...
3,On the no-gravity limit of gravity,"[J. Kowalski-Glikman, M. Szczachor]",2012-12-21T19:55:49Z,2012-12-21T19:55:49Z,We argue that Relative Locality may arise in t...,http://arxiv.org/abs/1212.5572v1,arXiv:1212.5572v1 [gr-qc] 21 Dec 2012Septemb...
4,"Comment on ""Inconsistencies in Verlinde's emer...",[Youngsub Yoon],2020-03-05T15:47:58Z,2020-03-05T15:47:58Z,"In 2016, Erik Verlinde proposed a new theory o...",http://arxiv.org/abs/2003.03198v1,arXiv:2003.03198v1 [gr-qc] 5 Mar 2020Comment...


In [59]:
df_with_pdf_text['PDF_Text'][0]

'arXiv:hep-th/0703034v2  30 Mar 2007hep-th/0703034\nTIT/HEP-566\nD=4 Einstein gravity from higher D CS and\nBI gravity and an alternative to dimensional\nreduction\nHoratiu Nastase∗\nGlobal Edge Institute,\nTokyo Institute of Technology\nTokyo 152-8550, Japan\nAbstract\nAn alternative to usual dimensional reduction for gravity i s an-\nalyzed, in the vielbein-spin connection formulation. Usua l 4d Ein-\nstein gravity plus a topological term (the ”Born-Infeld” La grangian\nfor gravity), is shown to be obtained by a generalized dimens ional\nreduction from 5d Chern-Simons gravity. Chern-Simons grav ity in\nd=2n+1 is dimensionally reduced to CS gravity in d=2n-1 via a\nmechanism similar to descent equations. The consistency of the\ndimensional reduction in both cases is analyzed. The dimens ional\nreductionofd=2n+2Born-Infeldgravitytod=2nBIgravity, aswell\nas d=2n BI gravity to d=2n-1 CS gravity is hard to achieve. Thu s\n4d gravity (plus a topological term) can be embedded into d=2 n+1\n

In [60]:
df_with_pdf_text['Cleaned_Text'] = df_with_pdf_text['PDF_Text'].apply(clean_text)
df_with_pdf_text

Unnamed: 0,Title,Authors,Published,Updated,Summary,ID,PDF_Text,Cleaned_Text
0,D=4 Einstein gravity from higher D CS and BI g...,[Horatiu Nastase],2007-03-05T08:56:06Z,2007-03-30T08:02:24Z,An alternative to usual dimensional reduction ...,http://arxiv.org/abs/hep-th/0703034v2,arXiv:hep-th/0703034v2 30 Mar 2007hep-th/0703...,arxivhepthv mar hepth tithep einstein gravity ...
1,How to quantize gravity and how not to quantiz...,[Philip D. Mannheim],2022-09-29T18:43:49Z,2023-01-16T02:34:47Z,Taking the quantization of electromagnetism as...,http://arxiv.org/abs/2209.15047v2,Springer Nature 2021 L ATEX template\nHow to q...,springer nature l atex template quantize gravi...
2,KPZ Formulas for Weyl-Invariant Induced Gravit...,[G. Amelino-Camelia],1997-01-28T15:32:11Z,1997-01-28T15:32:11Z,I discuss the applicability in Weyl-invariant ...,http://arxiv.org/abs/hep-th/9701161v1,arXiv:hep-th/9701161v1 28 Jan 19971\nKPZ Form...,arxivhepthv jan kpz formula weylinvariant indu...
3,On the no-gravity limit of gravity,"[J. Kowalski-Glikman, M. Szczachor]",2012-12-21T19:55:49Z,2012-12-21T19:55:49Z,We argue that Relative Locality may arise in t...,http://arxiv.org/abs/1212.5572v1,arXiv:1212.5572v1 [gr-qc] 21 Dec 2012Septemb...,arxivv grqc dec september wspc proceeding trim...
4,"Comment on ""Inconsistencies in Verlinde's emer...",[Youngsub Yoon],2020-03-05T15:47:58Z,2020-03-05T15:47:58Z,"In 2016, Erik Verlinde proposed a new theory o...",http://arxiv.org/abs/2003.03198v1,arXiv:2003.03198v1 [gr-qc] 5 Mar 2020Comment...,arxivv grqc mar comment inconsistency verlinde...


In [61]:
# print(df['PDF_Text'].iloc[0])


In [62]:
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
# https://huggingface.co/com3dian/Bart-large-paper2slides-summarizer/blob/main/README.md?code=true#L22

In [63]:
# Load the BART-large-paper2slides-summarizer model and tokenizer
model_name = "com3dian/Bart-large-paper2slides-summarizer"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


In [64]:
# Function to summarize text using the loaded model and tokenizer
def summarize_text(text):
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    output = model.generate(input_ids, max_length=1000, min_length=40, length_penalty=1.0, num_beams=4, early_stopping=False)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary

In [65]:
%%time

# Summarize all the texts in the 'PDF_Text' column of the DataFrame
df_with_pdf_text['new_Summary'] = df_with_pdf_text['PDF_Text'].apply(summarize_text)

CPU times: user 1min 11s, sys: 9.86 s, total: 1min 21s
Wall time: 48.2 s


In [66]:
df_with_pdf_text

Unnamed: 0,Title,Authors,Published,Updated,Summary,ID,PDF_Text,Cleaned_Text,new_Summary
0,D=4 Einstein gravity from higher D CS and BI g...,[Horatiu Nastase],2007-03-05T08:56:06Z,2007-03-30T08:02:24Z,An alternative to usual dimensional reduction ...,http://arxiv.org/abs/hep-th/0703034v2,arXiv:hep-th/0703034v2 30 Mar 2007hep-th/0703...,arxivhepthv mar hepth tithep einstein gravity ...,An alternative to usual dimensional reduction ...
1,How to quantize gravity and how not to quantiz...,[Philip D. Mannheim],2022-09-29T18:43:49Z,2023-01-16T02:34:47Z,Taking the quantization of electromagnetism as...,http://arxiv.org/abs/2209.15047v2,Springer Nature 2021 L ATEX template\nHow to q...,springer nature l atex template quantize gravi...,Springer Nature 2021 L ATEX template How to qu...
2,KPZ Formulas for Weyl-Invariant Induced Gravit...,[G. Amelino-Camelia],1997-01-28T15:32:11Z,1997-01-28T15:32:11Z,I discuss the applicability in Weyl-invariant ...,http://arxiv.org/abs/hep-th/9701161v1,arXiv:hep-th/9701161v1 28 Jan 19971\nKPZ Form...,arxivhepthv jan kpz formula weylinvariant indu...,Idiscusstheapplicability in Weyl-invariantindu...
3,On the no-gravity limit of gravity,"[J. Kowalski-Glikman, M. Szczachor]",2012-12-21T19:55:49Z,2012-12-21T19:55:49Z,We argue that Relative Locality may arise in t...,http://arxiv.org/abs/1212.5572v1,arXiv:1212.5572v1 [gr-qc] 21 Dec 2012Septemb...,arxivv grqc dec september wspc proceeding trim...,Relative Locality may arise in the no gravity ...
4,"Comment on ""Inconsistencies in Verlinde's emer...",[Youngsub Yoon],2020-03-05T15:47:58Z,2020-03-05T15:47:58Z,"In 2016, Erik Verlinde proposed a new theory o...",http://arxiv.org/abs/2003.03198v1,arXiv:2003.03198v1 [gr-qc] 5 Mar 2020Comment...,arxivv grqc mar comment inconsistency verlinde...,"In 2016, Erik Verlinde proposed a new theory o..."


In [67]:
# Display the DataFrame with the summarized texts
print(df_with_pdf_text[['Title', 'new_Summary']])

                                               Title  \
0  D=4 Einstein gravity from higher D CS and BI g...   
1  How to quantize gravity and how not to quantiz...   
2  KPZ Formulas for Weyl-Invariant Induced Gravit...   
3                 On the no-gravity limit of gravity   
4  Comment on "Inconsistencies in Verlinde's emer...   

                                         new_Summary  
0  An alternative to usual dimensional reduction ...  
1  Springer Nature 2021 L ATEX template How to qu...  
2  Idiscusstheapplicability in Weyl-invariantindu...  
3  Relative Locality may arise in the no gravity ...  
4  In 2016, Erik Verlinde proposed a new theory o...  


In [72]:
#print(df_with_pdf_text['Title'].iloc[2])
j=4
print(df_with_pdf_text['Summary'][j])
print('\n')
print(df_with_pdf_text['new_Summary'][j])

In 2016, Erik Verlinde proposed a new theory of gravity called "emergent
gravity" by using mathematical formulas used in the theory of elasticity. In
2017, De-Chang Dai and Dejan Stojkovic claimed to point out inconsistencies in
Verlinde's emergent gravity. We point out that their claim was based on
misunderstanding of the dictionary between emergent gravity and theory of
elasticity. In addition, we propose a slightly different formula for Verlinde's
emergent gravity.


In 2016, Erik Verlinde proposed a new theory of gravity calle d “emergent gravity” by using mathematical formulas used in the theory of elastic ity. In 2017, De-Chang Dai and Dejan Stojkovic claimed to point out inconsistencies in Ver linde’s emergent gravity. We point out that their claim was based on misunderstanding o f the dictionary between the theories of elasticity and gravity. In addition, we p ropose a slightly diﬀerentiableformula for Verl Inde's theory.


In [41]:
print(df_with_pdf_text['Summary'].iloc[2])


[{'summary_text': 'quantum mechanic quantization free radiation field addressed following strategy free operator quantum mechanical concept intimate law nature fall realm quantum physic seems territory controlled quantum law despite fact knowledge world classica l divide date back year new thods conceived capture meaning could explain ed term classical concept however question . rely help provided quantum harmonic oscillator able find quantum mechanical result . normally realized according correspondence principle quantum mechanic . quantization alternative quantization procedure start quantum correspondence break n point vector potential .'}]


In [39]:
%%time

# Load the BART-large-paper2slides-summarizer model and tokenizer
model_name = "com3dian/Bart-large-paper2slides-summarizer"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Concatenate all the individual summaries into one large text
all_summaries = "\n".join(df_with_pdf_text['Summary'])

# Function to summarize the combined text
def summarize_combined_text(text):
    input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True) #, max_length=1000)
    output = model.generate(input_ids, 
                            max_length=1000, 
                            min_length=500, 
                            length_penalty=5.0, 
                            num_beams=4, 
                            early_stopping=True)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary

# Summarize the combined text
final_summary = summarize_combined_text(all_summaries)

# Display the final summary
print(final_summary)

TypeError: sequence item 0: expected str instance, list found