In [1]:
# Imports
import requests  
from bs4 import BeautifulSoup  
import pandas as pd  
import io  
from PyPDF2 import PdfReader  
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
# https://huggingface.co/com3dian/Bart-large-paper2slides-summarizer/blob/main/README.md?code=true#L22

[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>
[nltk_data] Error loading wordnet: <urlopen error [WinError 10054] An
[nltk_data]     existing connection was forcibly closed by the remote
[nltk_data]     host>


In [2]:
def fetch_arxiv_data(search_query, max_results=10, start_date=None, end_date=None, primary_category=None, categories=None):
    """
    Fetches data from the arXiv API based on specified parameters.

    Parameters:
        search_query (str): The search query string.
        max_results (int): The maximum number of results to retrieve (default is 10).
        start_date (str): The start date for the search query in the format 'YYYY-MM-DD'.
        end_date (str): The end date for the search query in the format 'YYYY-MM-DD'.
        primary_category (str): The primary category of the articles.
        categories (str): Additional categories for the articles.

    Returns:
        DataFrame: A pandas DataFrame containing the fetched arXiv data.
    """
    api_url = "http://export.arxiv.org/api/query"  # Defining the API URL for arXiv
    params = {
        "search_query": search_query,  # Setting search query parameter
        "start": 0,  # Setting start parameter for pagination
        "max_results": max_results,  # Setting maximum results parameter
    }
    if start_date:
        params["start_date"] = start_date  # Adding start date parameter if provided
    if end_date:
        params["end_date"] = end_date  # Adding end date parameter if provided
    if primary_category:
        params["cat"] = primary_category  # Adding primary category parameter if provided
    if categories:
        params["categories"] = categories  # Adding additional categories parameter if provided

    response = requests.get(api_url, params=params)  # Making a GET request to arXiv API with specified parameters
    if response.status_code == 200:  # Checking if the request was successful
        feed = BeautifulSoup(response.content, features="html.parser")  # Parsing the response content using BeautifulSoup
        entries = feed.find_all('entry')  # Finding all 'entry' elements in the parsed content
        articles = []

        for entry in entries:  # Looping through each entry
            article = {}  # Creating an empty dictionary to store article data
            article['Title'] = entry.title.text  # Extracting title of the article
            article['Authors'] = [author.find('name').text for author in entry.find_all('author')]  # Extracting authors of the article
            article['Published'] = entry.published.text  # Extracting publication date
            article['Updated'] = entry.updated.text  # Extracting last updated date
            article['Summary'] = entry.summary.text.strip()  # Extracting summary of the article
            article['ID'] = entry.id.text  # Extracting unique ID of the article
            articles.append(article)  # Appending article data to the list

        df = pd.DataFrame(articles)  # Creating a pandas DataFrame from the list of articles
        return df  # Returning the DataFrame
    else:
        print("Failed to retrieve data from arXiv API")  # Printing error message if request fails
        return None

def download_pdf_from_link(link):
    """
    Downloads a PDF file from the given URL.

    Parameters:
        link (str): The URL of the PDF file.

    Returns:
        io.BytesIO or None: BytesIO object containing the PDF content if successful, else None.
    """
    response = requests.get(link, stream=True)  # Making a GET request to download the PDF file
    if response.status_code == 200:  # Checking if the request was successful
        return io.BytesIO(response.content)  # Returning BytesIO object containing the PDF content
    else:
        print(f"Failed to download PDF from {link}")  # Printing error message if download fails
        return None

def extract_text_from_pdf(pdf_io):
    """
    Extracts text from a PDF file.

    Parameters:
        pdf_io (io.BytesIO): BytesIO object containing the PDF content.

    Returns:
        str: Extracted text from the PDF.
    """
    if pdf_io:
        try:
            reader = PdfReader(pdf_io)  # Creating a PdfReader object with the PDF content
            text = ""
            for page in reader.pages:  # Looping through each page in the PDF
                text += page.extract_text() + "\n"  # Extracting text from the page and appending it to the 'text' variable
            return text  # Returning the extracted text
        except Exception as e:
            print(f"Error occurred while extracting text from PDF: {str(e)}")  # Printing error message if extraction fails
            return ""

def process_pdf_links(df):
    """
    Processes PDF links in a DataFrame by downloading and extracting text from them.

    Parameters:
        df (DataFrame): The pandas DataFrame containing PDF links.

    Returns:
        DataFrame: The DataFrame with an additional column containing extracted text from PDFs.
    """
    df['PDF_Text'] = ''  # Adding an empty column 'PDF_Text' to the DataFrame
    for index, row in df.iterrows():  # Looping through each row in the DataFrame
        link = row['ID']  # Extracting the PDF link from the 'ID' column
        # We need to convert the '/abs/' URL to a '/pdf/' URL and ensure it ends with '.pdf'
        link = link.replace('abs', 'pdf')  # Replacing 'abs' with 'pdf' in the URL
        if not link.endswith('.pdf'):  # Checking if the link ends with '.pdf'
            link += '.pdf'  # Appending '.pdf' to the link if it doesn't end with it already
        print(f"Processing link: {link}")  # Printing the processed link
        pdf_io = download_pdf_from_link(link)  # Downloading the PDF file
        if pdf_io:  # Checking if PDF download was successful
            text = extract_text_from_pdf(pdf_io)  # Extracting text from the PDF
            df.at[index, 'PDF_Text'] = text  # Adding the extracted text to the 'PDF_Text' column in the DataFrame
    return df  # Returning the updated DataFrame

def preprocess_text(text):
    """
    Preprocesses the text by converting it to lowercase, removing digits, and removing punctuation.

    Parameters:
        text (str): The input text to be preprocessed.

    Returns:
        str: The preprocessed text.
    """
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

def remove_stopwords(tokens):
    """
    Removes stopwords from the list of tokens.

    Parameters:
        tokens (list): List of tokens.

    Returns:
        list: Tokens with stopwords removed.
    """
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words] 
    return filtered_tokens 

def perform_lemmatization(tokens):
    """
    Performs lemmatization on the list of tokens.

    Parameters:
        tokens (list): List of tokens.

    Returns:
        list: Lemmatized tokens.
    """
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

def clean_text(text):
    """
    Cleans the text by preprocessing, removing stopwords, and performing lemmatization.

    Parameters:
        text (str): The input text to be cleaned.

    Returns:
        str: The cleaned text.
    """
    # Preprocess the text
    text = preprocess_text(text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    filtered_tokens = remove_stopwords(tokens)
    
    # Perform lemmatization
    lemmatized_tokens = perform_lemmatization(filtered_tokens)
    
    # Join the lemmatized tokens into a string
    cleaned_text = ' '.join(lemmatized_tokens)
    
    return cleaned_text
    

In [3]:
# Example usage
df = fetch_arxiv_data(search_query="quantum physics", max_results=10, start_date="2023-01-01", end_date="2023-12-31", primary_category="quant-ph", categories=["quant-ph", "cond-mat"])
df_with_pdf_text = process_pdf_links(df)
df_with_pdf_text

Processing link: http://arxiv.org/pdf/quant-ph/0302169v1.pdf
Processing link: http://arxiv.org/pdf/1212.4177v1.pdf
Processing link: http://arxiv.org/pdf/1504.03207v1.pdf
Processing link: http://arxiv.org/pdf/2208.08064v1.pdf
Processing link: http://arxiv.org/pdf/cond-mat/0601285v1.pdf
Processing link: http://arxiv.org/pdf/0811.2516v1.pdf
Processing link: http://arxiv.org/pdf/1807.11019v1.pdf
Processing link: http://arxiv.org/pdf/quant-ph/0201082v1.pdf
Processing link: http://arxiv.org/pdf/quant-ph/0309066v1.pdf
Processing link: http://arxiv.org/pdf/quant-ph/0504224v1.pdf


Unnamed: 0,Title,Authors,Published,Updated,Summary,ID,PDF_Text
0,Nonlinear Dynamics In Quantum Physics -- Quant...,[H. Kröger],2003-02-21T20:20:47Z,2003-02-21T20:20:47Z,We discuss the recently proposed quantum actio...,http://arxiv.org/abs/quant-ph/0302169v1,arXiv:quant-ph/0302169v1 21 Feb 2003PROCEEDIN...
1,Quantum spherical model,[I. Lyberg],2012-12-17T22:06:56Z,2012-12-17T22:06:56Z,"We define a ""quantum spherical model"", a quant...",http://arxiv.org/abs/1212.4177v1,arXiv:1212.4177v1 [math-ph] 17 Dec 2012A “qu...
2,Can classical physics agree with quantum physi...,[Michele Marrocco],2015-04-13T15:04:26Z,2015-04-13T15:04:26Z,Classical physics fails where quantum physics ...,http://arxiv.org/abs/1504.03207v1,1 Can classical physics agree with quant um p...
3,The Physics of Quantum Information,[John Preskill],2022-08-17T04:35:36Z,2022-08-17T04:35:36Z,Rapid ongoing progress in quantum information ...,http://arxiv.org/abs/2208.08064v1,arXiv:2208.08064v1 [quant-ph] 17 Aug 2022Aug...
4,Topologization of electron liquids with Chern-...,[Zhenghan Wang],2006-01-13T09:06:10Z,2006-01-13T09:06:10Z,"We discuss a nexus among quantum topology, qua...",http://arxiv.org/abs/cond-mat/0601285v1,arXiv:cond-mat/0601285v1 [cond-mat.mes-hall] ...
5,"Operational Quantum Mechanics, Quantum Axiomat...",[Diederik Aerts],2008-11-15T18:37:02Z,2008-11-15T18:37:02Z,"The role of operational quantum mechanics, qua...",http://arxiv.org/abs/0811.2516v1,arXiv:0811.2516v1 [physics.hist-ph] 15 Nov 2...
6,Universal Uncertainty Principle in Different Q...,"[C. Huang, Yong-Chang Huang]",2018-07-29T07:50:32Z,2018-07-29T07:50:32Z,This paper deduces universal uncertainty princ...,http://arxiv.org/abs/1807.11019v1,1 \n Universal Uncertainty Principle in Diff...
7,Quantum Computers and Quantum Computer Languag...,[Stephen Blaha],2002-01-18T15:08:05Z,2002-01-18T15:08:05Z,We show a representation of Quantum Computers ...,http://arxiv.org/abs/quant-ph/0201082v1,QQuuaannttuumm CCoommppuutteerrss aanndd QQ...
8,Probabilistic foundations of quantum mechanics...,[Andrei Khrennikov],2003-09-08T09:37:27Z,2003-09-08T09:37:27Z,We discuss foundation of quantum mechanics (in...,http://arxiv.org/abs/quant-ph/0309066v1,arXiv:quant-ph/0309066v1 8 Sep 2003Probabilis...
9,From quantum graphs to quantum random walks,[Gregor Tanner],2005-04-29T09:41:00Z,2005-04-29T09:41:00Z,We give a short overview over recent developme...,http://arxiv.org/abs/quant-ph/0504224v1,arXiv:quant-ph/0504224v1 29 Apr 2005From quan...


In [36]:
df_with_pdf_text['Cleaned_Text'] = df_with_pdf_text['PDF_Text'].apply(clean_text)
df_with_pdf_text

Unnamed: 0,Title,Authors,Published,Updated,Summary,ID,PDF_Text,Cleaned_Text
0,Nonlinear Dynamics In Quantum Physics -- Quant...,[H. Kröger],2003-02-21T20:20:47Z,2003-02-21T20:20:47Z,We discuss the recently proposed quantum actio...,http://arxiv.org/abs/quant-ph/0302169v1,arXiv:quant-ph/0302169v1 21 Feb 2003PROCEEDIN...,arxivquantphv feb proceeding tenth internation...
1,Quantum spherical model,[I. Lyberg],2012-12-17T22:06:56Z,2012-12-17T22:06:56Z,"We define a ""quantum spherical model"", a quant...",http://arxiv.org/abs/1212.4177v1,arXiv:1212.4177v1 [math-ph] 17 Dec 2012A “qu...,arxivv mathph dec quantum spherical model tran...
2,Can classical physics agree with quantum physi...,[Michele Marrocco],2015-04-13T15:04:26Z,2015-04-13T15:04:26Z,Classical physics fails where quantum physics ...,http://arxiv.org/abs/1504.03207v1,1 Can classical physics agree with quant um p...,classical physic agree quant um physic quantum...
3,The Physics of Quantum Information,[John Preskill],2022-08-17T04:35:36Z,2022-08-17T04:35:36Z,Rapid ongoing progress in quantum information ...,http://arxiv.org/abs/2208.08064v1,arXiv:2208.08064v1 [quant-ph] 17 Aug 2022Aug...,arxivv quantph aug august wsprocsx wspc procee...
4,Topologization of electron liquids with Chern-...,[Zhenghan Wang],2006-01-13T09:06:10Z,2006-01-13T09:06:10Z,"We discuss a nexus among quantum topology, qua...",http://arxiv.org/abs/cond-mat/0601285v1,arXiv:cond-mat/0601285v1 [cond-mat.mes-hall] ...,arxivcondmatv condmatmeshall jan topologizatio...
5,"Operational Quantum Mechanics, Quantum Axiomat...",[Diederik Aerts],2008-11-15T18:37:02Z,2008-11-15T18:37:02Z,"The role of operational quantum mechanics, qua...",http://arxiv.org/abs/0811.2516v1,arXiv:0811.2516v1 [physics.hist-ph] 15 Nov 2...,arxivv physicshistph nov operational quantum m...
6,Universal Uncertainty Principle in Different Q...,"[C. Huang, Yong-Chang Huang]",2018-07-29T07:50:32Z,2018-07-29T07:50:32Z,This paper deduces universal uncertainty princ...,http://arxiv.org/abs/1807.11019v1,1 \n Universal Uncertainty Principle in Diff...,universal uncertainty principle different quan...
7,Quantum Computers and Quantum Computer Languag...,[Stephen Blaha],2002-01-18T15:08:05Z,2002-01-18T15:08:05Z,We show a representation of Quantum Computers ...,http://arxiv.org/abs/quant-ph/0201082v1,QQuuaannttuumm CCoommppuutteerrss aanndd QQ...,qquuaannttuumm ccoommppuutteerrss aanndd qquua...
8,Probabilistic foundations of quantum mechanics...,[Andrei Khrennikov],2003-09-08T09:37:27Z,2003-09-08T09:37:27Z,We discuss foundation of quantum mechanics (in...,http://arxiv.org/abs/quant-ph/0309066v1,arXiv:quant-ph/0309066v1 8 Sep 2003Probabilis...,arxivquantphv sep probabilistic foundation qua...
9,From quantum graphs to quantum random walks,[Gregor Tanner],2005-04-29T09:41:00Z,2005-04-29T09:41:00Z,We give a short overview over recent developme...,http://arxiv.org/abs/quant-ph/0504224v1,arXiv:quant-ph/0504224v1 29 Apr 2005From quan...,arxivquantphv apr quantum graph quantum random...


In [5]:
# print(df['PDF_Text'].iloc[0])


In [4]:
from transformers import BartTokenizer, BartForConditionalGeneration, pipeline
# https://huggingface.co/com3dian/Bart-large-paper2slides-summarizer/blob/main/README.md?code=true#L22

In [5]:
# Load the BART-large-paper2slides-summarizer model and tokenizer
model_name = "com3dian/Bart-large-paper2slides-summarizer"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)


In [6]:
# Function to summarize text using the loaded model and tokenizer
def summarize_text(text):
    input_ids = tokenizer.encode(text, return_tensors="pt", max_length=1024, truncation=True)
    output = model.generate(input_ids, max_length=500, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary

In [7]:
%%time

# Summarize all the texts in the 'PDF_Text' column of the DataFrame
df_with_pdf_text['Summary'] = df_with_pdf_text['PDF_Text'].apply(summarize_text)

CPU times: total: 20min 6s
Wall time: 3min 57s


In [8]:
# Display the DataFrame with the summarized texts
print(df_with_pdf_text[['Title', 'Summary']])

                                               Title  \
0  Nonlinear Dynamics In Quantum Physics -- Quant...   
1                            Quantum spherical model   
2  Can classical physics agree with quantum physi...   
3                 The Physics of Quantum Information   
4  Topologization of electron liquids with Chern-...   
5  Operational Quantum Mechanics, Quantum Axiomat...   
6  Universal Uncertainty Principle in Different Q...   
7  Quantum Computers and Quantum Computer Languag...   
8  Probabilistic foundations of quantum mechanics...   
9        From quantum graphs to quantum random walks   

                                             Summary  
0  quantum mechanical tunneling, quantum instanto...  
1  A “quantum spherical model” with a transverse ...  
2  Classical physics fails where quantum physics ...  
3  Solvay Conference focused on The Physics of Qu...  
4  Chern-Simons functional is used to d eﬁne TQFT...  
5  Operational quantum mechanics and quantum axio... 

In [16]:
print(df_with_pdf_text['Title'].iloc[2])
print(df_with_pdf_text['Summary'].iloc[2])


Can classical physics agree with quantum physics on quantum phenomena?
Classical physics fails where quantum physics prevails. This comm on understanding applies to quantum phenomena that are acknowledged to be beyond the reach of classical physics. The trial run is the quantization of the free radiation field that will be addressed by following  a strategy that is free from operators or quantum -mechanical concepts.


In [41]:
print(df_with_pdf_text['Summary'].iloc[2])


[{'summary_text': 'quantum mechanic quantization free radiation field addressed following strategy free operator quantum mechanical concept intimate law nature fall realm quantum physic seems territory controlled quantum law despite fact knowledge world classica l divide date back year new thods conceived capture meaning could explain ed term classical concept however question . rely help provided quantum harmonic oscillator able find quantum mechanical result . normally realized according correspondence principle quantum mechanic . quantization alternative quantization procedure start quantum correspondence break n point vector potential .'}]


In [39]:
%%time

# Load the BART-large-paper2slides-summarizer model and tokenizer
model_name = "com3dian/Bart-large-paper2slides-summarizer"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

# Concatenate all the individual summaries into one large text
all_summaries = "\n".join(df_with_pdf_text['Summary'])

# Function to summarize the combined text
def summarize_combined_text(text):
    input_ids = tokenizer.encode(text, return_tensors="pt", truncation=True) #, max_length=1000)
    output = model.generate(input_ids, 
                            max_length=1000, 
                            min_length=500, 
                            length_penalty=5.0, 
                            num_beams=4, 
                            early_stopping=True)
    summary = tokenizer.decode(output[0], skip_special_tokens=True)
    return summary

# Summarize the combined text
final_summary = summarize_combined_text(all_summaries)

# Display the final summary
print(final_summary)

TypeError: sequence item 0: expected str instance, list found