In [1]:
import requests
import os
import re
import emoji
import pandas as pd
from collections import Counter, defaultdict
import string
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

In [2]:
def text_cleaner(input_text):
    # Remove URLs
    text_without_urls = re.sub(r'http\S+', '', input_text)
    
    # Remove punctuation, including special characters like '›'
    clean_text = ''.join([char if char not in string.punctuation + '›' else ' ' for char in text_without_urls])
    
    # Convert text to lowercase and split into words
    words = clean_text.lower().split()
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    return ' '.join(words)  # Join the words back into a string

In [3]:
def fetch_arxiv_data(search_query, max_results=10, start_date=None, end_date=None, primary_category=None, categories=None):
    # Define the API endpoint URL for searching articles
    api_url = "http://export.arxiv.org/api/query"

    # Define additional parameters like start and max_results
    params = {
        "search_query": search_query,
        "start": 0,  # Start index of results
        "max_results": max_results,  # Maximum number of results to retrieve
    }

    # Add start and end date parameters if provided
    if start_date is not None:
        params["start_date"] = start_date
    if end_date is not None:
        params["end_date"] = end_date

    # Add primary category and categories parameters if provided
    if primary_category is not None:
        params["cat"] = primary_category
    if categories is not None:
        params["categories"] = categories

    # Send an HTTP GET request to the arXiv API
    response = requests.get(api_url, params=params)

    # Check if the request was successful (HTTP status code 200)
    if response.status_code == 200:
        # Parse the response content using BeautifulSoup
        soup = BeautifulSoup(response.content, "xml")

        # Create a list to store article information
        articles = []

        # Extract article information from the parsed content
        entries = soup.find_all("entry")
        for entry in entries:
            title = entry.find("title").text
            paper_id = entry.find("id").text
            published = entry.find("published").text
            updated = entry.find("updated").text
            summary = entry.find("summary").text
            author = [author.text for author in entry.find_all("author")]
            comments = entry.find("arxiv:comment").text if entry.find("arxiv:comment") else ""
            journal_ref = entry.find("arxiv:journal_ref").text if entry.find("arxiv:journal_ref") else ""
            link = entry.find("link")["href"] if entry.find("link") else ""
            primary_category = entry.find("arxiv:primary_category")["term"] if entry.find("arxiv:primary_category") else ""
            categories = [cat["term"] for cat in entry.find_all("category")]
            doi = entry.find("arxiv:doi").text if entry.find("arxiv:doi") else ""
            license = entry.find("arxiv:license")["type"] if entry.find("arxiv:license") else ""
            affiliation = [aff.text for aff in entry.find_all("arxiv:affiliation")]


            # Append article information to the list
            articles.append({
                "Title": title,
                "ID": paper_id,
                "Published": published,
                "Updated": updated,
                "Summary": summary,
                "Author": author,
                "Comments": comments,
                "Journal_Ref": journal_ref,
                "Link": link,
                "Primary_Category": primary_category,
                "Categories": categories,
                "DOI": doi,
                "License": license,
                "Affiliation": affiliation,
            })

        # Create a DataFrame from the list of articles
        df = pd.DataFrame(articles)

        # Return the DataFrame
        return df

    else:
        print("Failed to retrieve data from arXiv API")
        return None

In [4]:
def extract_text_from_link(link):
    try:
        # Send an HTTP GET request to the link without verifying SSL certificates
        response = requests.get(link, verify=False)
        if response.status_code == 200:
            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response.content, "html.parser")
            # Extract text from the HTML content
            text = soup.get_text()
            return text
        else:
            print(f"Failed to retrieve content from {link}")
            return None
    except Exception as e:
        print(f"Error occurred while fetching content from {link}: {str(e)}")
        return None

In [5]:
def process_links(df):
    # Create a new column for cleaned and tokenized text
    df['Cleaned_Text'] = ''
    
    # Iterate through each row
    for index, row in df.iterrows():
        # Extract link from the "Link" column
        link = row['Link']
        # Extract text content from the link
        text = extract_text_from_link(link)
        if text:
            # Clean and tokenize the text
            cleaned_text = text_cleaner(text)
            # Update the corresponding row in the DataFrame
            df.at[index, 'Cleaned_Text'] = cleaned_text
        else:
            # If unable to fetch content, leave the "Cleaned_Text" column empty for that row
            df.at[index, 'Cleaned_Text'] = ''
        
        # Split the cleaned text into words
        cleaned_text_words = cleaned_text.split()
        # Join the words with commas
        cleaned_text_comma_separated = ", ".join(cleaned_text_words)
    
    return df

In [10]:
%%time
# Fetch data from arXiv and store it in a DataFrame
df = fetch_arxiv_data(search_query="quantum physics", max_results=10, start_date="2023-01-01", end_date="2023-12-31", primary_category="quant-ph", categories=["quant-ph", "cond-mat"])

# Process links in the DataFrame to clean and tokenize text
df_with_cleaned_text = process_links(df)

CPU times: total: 2.02 s
Wall time: 9.67 s


In [11]:
df_with_cleaned_text

Unnamed: 0,Title,ID,Published,Updated,Summary,Author,Comments,Journal_Ref,Link,Primary_Category,Categories,DOI,License,Affiliation,Cleaned_Text
0,Nonlinear Dynamics In Quantum Physics -- Quant...,http://arxiv.org/abs/quant-ph/0302169v1,2003-02-21T20:20:47Z,2003-02-21T20:20:47Z,We discuss the recently proposed quantum act...,[\nH. Kröger\n],"inv. talk, 10th Int. Conf. Comput. and Appl. M...",,http://arxiv.org/abs/quant-ph/0302169v1,quant-ph,[quant-ph],,,[],quant ph 0302169v1 nonlinear dynamics quantum ...
1,Quantum spherical model,http://arxiv.org/abs/1212.4177v1,2012-12-17T22:06:56Z,2012-12-17T22:06:56Z,"We define a ""quantum spherical model"", a qua...",[\nI. Lyberg\n],,,http://arxiv.org/abs/1212.4177v1,math-ph,"[math-ph, math.MP]",,,[],1212 4177v1 quantum spherical model skip main ...
2,Can classical physics agree with quantum physi...,http://arxiv.org/abs/1504.03207v1,2015-04-13T15:04:26Z,2015-04-13T15:04:26Z,Classical physics fails where quantum physic...,[\nMichele Marrocco\n],9 pages,,http://arxiv.org/abs/1504.03207v1,physics.class-ph,"[physics.class-ph, quant-ph]",,,[],1504 03207v1 classical physics agree quantum p...
3,The Physics of Quantum Information,http://arxiv.org/abs/2208.08064v1,2022-08-17T04:35:36Z,2022-08-17T04:35:36Z,Rapid ongoing progress in quantum informatio...,[\nJohn Preskill\n],23 pages. Overview talk at the 28th Solvay Con...,,http://arxiv.org/abs/2208.08064v1,quant-ph,"[quant-ph, cond-mat.str-el, hep-th]",,,[],2208 08064v1 physics quantum information skip ...
4,Topologization of electron liquids with Chern-...,http://arxiv.org/abs/cond-mat/0601285v1,2006-01-13T09:06:10Z,2006-01-13T09:06:10Z,"We discuss a nexus among quantum topology, q...",[\nZhenghan Wang\n],,,http://arxiv.org/abs/cond-mat/0601285v1,cond-mat.mes-hall,"[cond-mat.mes-hall, math-ph, math.MP]",,,[],cond mat 0601285v1 topologization electron liq...
5,"Operational Quantum Mechanics, Quantum Axiomat...",http://arxiv.org/abs/0811.2516v1,2008-11-15T18:37:02Z,2008-11-15T18:37:02Z,"The role of operational quantum mechanics, q...",[\nDiederik Aerts\n],6 pages,"In D. Greenberger, K. Hentschel and F. Wienert...",http://dx.doi.org/10.1007/978-3-540-70626-7,physics.hist-ph,"[physics.hist-ph, quant-ph]",10.1007/978-3-540-70626-7,,[],compendium quantum physics concepts experiment...
6,Universal Uncertainty Principle in Different Q...,http://arxiv.org/abs/1807.11019v1,2018-07-29T07:50:32Z,2018-07-29T07:50:32Z,This paper deduces universal uncertainty pri...,"[\nC. Huang\n, \nYong-Chang Huang\n]",15 pages,,http://arxiv.org/abs/1807.11019v1,quant-ph,[quant-ph],,,[],1807 11019v1 universal uncertainty principle d...
7,Quantum Computers and Quantum Computer Languag...,http://arxiv.org/abs/quant-ph/0201082v1,2002-01-18T15:08:05Z,2002-01-18T15:08:05Z,We show a representation of Quantum Computer...,[\nStephen Blaha\n],32 pages,,http://arxiv.org/abs/quant-ph/0201082v1,quant-ph,"[quant-ph, cs.PL]",,,[],quant ph 0201082v1 quantum computers quantum c...
8,Probabilistic foundations of quantum mechanics...,http://arxiv.org/abs/quant-ph/0309066v1,2003-09-08T09:37:27Z,2003-09-08T09:37:27Z,We discuss foundation of quantum mechanics (...,[\nAndrei Khrennikov\n],Contextual probabilistic viewpoint to quantum ...,,http://dx.doi.org/10.1117/12.517850,quant-ph,[quant-ph],10.1117/12.517850,,[],probabilistic foundations quantum mechanics qu...
9,From quantum graphs to quantum random walks,http://arxiv.org/abs/quant-ph/0504224v1,2005-04-29T09:41:00Z,2005-04-29T09:41:00Z,We give a short overview over recent develop...,[\nGregor Tanner\n],"14 pages, 6 figures",,http://dx.doi.org/10.1007/1-4020-3949-2_6,quant-ph,[quant-ph],10.1007/1-4020-3949-2_6,,[],quantum graphs quantum random walks springerli...


In [12]:
# Iterate over the first 5 rows
for index, row in df_with_cleaned_text.head().iterrows():
    print(f"Row {index + 1}:")
    print("Summary:", row['Summary'])
    print("Cleaned Text:\n", row['Cleaned_Text'])
    print()  # Print a blank line for better readability

Row 1:
Summary:   We discuss the recently proposed quantum action - its interpretation, its
motivation, its mathematical properties and its use in physics: quantum
mechanical tunneling, quantum instantons and quantum chaos.

Cleaned Text:
 quant ph 0302169v1 nonlinear dynamics quantum physics quantum chaos quantum instantons skip main content gratefully acknowledge support simons foundation member institutions contributors donate quant ph arxiv quant ph 0302169v1 help advanced search fields title author abstract comments journal reference acm classification msc classification report number arxiv identifier doi orcid arxiv author id help pages full text search open search go open navigation menu quick links login help pages quantum physics arxiv quant ph 0302169v1 quant ph submitted 21 feb 2003 title nonlinear dynamics quantum physics quantum chaos quantum instantons authors h kröger download pdf paper titled nonlinear dynamics quantum physics quantum chaos quantum instantons h kr oger 

In [13]:
def get_most_common_words(df):
    most_common_words = []
    
    # Iterate over each row
    for index, row in df.iterrows():
        # Split the cleaned text into words
        cleaned_text_words = row['Cleaned_Text'].split()
        
        # Count the occurrences of each word
        word_counts = Counter(cleaned_text_words)
        
        # Get the 5 most common words
        top_words = word_counts.most_common(5)
        
        # Append the most common words to the list
        most_common_words.append((index, top_words))
    
    return most_common_words

In [14]:
# Example usage
most_common_words = get_most_common_words(df_with_cleaned_text)

# Print the 5 most common words for each row
for index, top_words in most_common_words:
    print(f"Row {index + 1}:")
    for word, count in top_words:
        print(f"{word}: {count}")
    print()  # Print a blank line for better readability

Row 1:
quantum: 18
arxiv: 13
toggle: 12
quant: 9
ph: 9

Row 2:
arxiv: 13
toggle: 12
math: 8
code: 7
papers: 7

Row 3:
physics: 23
quantum: 13
arxiv: 13
toggle: 12
classical: 9

Row 4:
quantum: 17
arxiv: 13
toggle: 12
physics: 10
information: 7

Row 5:
arxiv: 13
toggle: 13
cond: 11
mat: 11
quantum: 7

Row 6:
pages: 21
quantum: 19
physics: 19
book: 17
history: 15

Row 7:
quantum: 16
arxiv: 13
toggle: 12
uncertainty: 10
principle: 9

Row 8:
quantum: 25
arxiv: 13
toggle: 12
language: 10
quant: 9

Row 9:
journal: 28
quantum: 24
spie: 24
proceedings: 14
sign: 13

Row 10:
google: 39
scholar: 38
ads: 29
phys: 26
crossref: 26



In [13]:
def process_pdf_links(df):
    if df.empty:
        print("The DataFrame is empty. No links to process.")
        return df
    
    # Iterate through each row
    for index, row in df.iterrows():
        # Extract link from the "ID" column
        link = row['ID']  # Update this if the PDF link is stored in a different column
        print(f"Processing link: {link}")  # Debugging log
        # Download PDF content from the link
        pdf_io = download_pdf_from_link(link)
        if pdf_io:
            # Extract text content from the PDF
            text = extract_text_from_pdf(pdf_io)
            print(f"Extracted text length: {len(text)}")  # Debugging log
            # Update the corresponding row in the DataFrame with the extracted text
            df.at[index, 'PDF_Text'] = text
        else:
            print(f"Failed to process link: {link}")
    return df

# Fetch data from arXiv and store it in a DataFrame
df = fetch_arxiv_data(search_query="quantum physics", max_results=10, start_date="2023-01-01", end_date="2023-12-31", primary_category="quant-ph", categories=["quant-ph", "cond-mat"])

# Process links in the DataFrame to download PDFs and extract their text
df_with_pdf_text = process_pdf_links(df)

Processing link: http://arxiv.org/abs/quant-ph/0302169v1


NameError: name 'download_pdf_from_link' is not defined

In [None]:
https://arxiv.org/abs/quant-ph/0302169v1

In [None]:
https://arxiv.org/pdf/quant-ph/0302169v1.pdf

In [21]:
import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
import io
import pandas as pd

def fetch_arxiv_data(search_query, max_results=10, start_date=None, end_date=None, primary_category=None, categories=None):
    api_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": search_query,
        "start": 0,
        "max_results": max_results,
    }
    if start_date:
        params["start_date"] = start_date
    if end_date:
        params["end_date"] = end_date
    if primary_category:
        params["cat"] = primary_category
    if categories:
        params["categories"] = categories

    response = requests.get(api_url, params=params)
    if response.status_code == 200:
        feed = BeautifulSoup(response.content, features="html.parser")
        entries = feed.find_all('entry')
        articles = []

        for entry in entries:
            article = {}
            article['Title'] = entry.title.text
            article['Authors'] = [author.find('name').text for author in entry.find_all('author')]
            article['Published'] = entry.published.text
            article['Updated'] = entry.updated.text
            article['Summary'] = entry.summary.text.strip()
            article['ID'] = entry.id.text
            articles.append(article)

        df = pd.DataFrame(articles)
        return df
    else:
        print("Failed to retrieve data from arXiv API")
        return None

def download_pdf_from_link(link):
    response = requests.get(link, stream=True)
    if response.status_code == 200:
        return io.BytesIO(response.content)
    else:
        print(f"Failed to download PDF from {link}")
        return None

def extract_text_from_pdf(pdf_io):
    if pdf_io:
        try:
            reader = PdfReader(pdf_io)
            text = ""
            for page in reader.pages:
                text += page.extract_text() + "\n"
            return text
        except Exception as e:
            print(f"Error occurred while extracting text from PDF: {str(e)}")
            return ""

def process_pdf_links(df):
    df['PDF_Text'] = ''
    for index, row in df.iterrows():
        link = row['ID']
        # We need to convert the '/abs/' URL to a '/pdf/' URL and ensure it ends with '.pdf'
        link = link.replace('abs', 'pdf')
        if not link.endswith('.pdf'):
            link += '.pdf'
        print(f"Processing link: {link}")
        pdf_io = download_pdf_from_link(link)
        if pdf_io:
            text = extract_text_from_pdf(pdf_io)
            df.at[index, 'PDF_Text'] = text
    return df



In [22]:
# Example usage
df = fetch_arxiv_data(search_query="quantum physics", max_results=10, start_date="2023-01-01", end_date="2023-12-31", primary_category="quant-ph", categories=["quant-ph", "cond-mat"])
df_with_pdf_text = process_pdf_links(df)
print(df_with_pdf_text)
df_with_pdf_text.to_csv('arxiv_data_with_pdf_text.csv', index=False)

Processing link: http://arxiv.org/pdf/quant-ph/0302169v1.pdf
Processing link: http://arxiv.org/pdf/1212.4177v1.pdf
Processing link: http://arxiv.org/pdf/1504.03207v1.pdf
Processing link: http://arxiv.org/pdf/2208.08064v1.pdf
Processing link: http://arxiv.org/pdf/cond-mat/0601285v1.pdf
Processing link: http://arxiv.org/pdf/0811.2516v1.pdf
Processing link: http://arxiv.org/pdf/1807.11019v1.pdf
Processing link: http://arxiv.org/pdf/quant-ph/0201082v1.pdf
Processing link: http://arxiv.org/pdf/quant-ph/0309066v1.pdf
Processing link: http://arxiv.org/pdf/quant-ph/0504224v1.pdf
                                               Title  \
0  Nonlinear Dynamics In Quantum Physics -- Quant...   
1                            Quantum spherical model   
2  Can classical physics agree with quantum physi...   
3                 The Physics of Quantum Information   
4  Topologization of electron liquids with Chern-...   
5  Operational Quantum Mechanics, Quantum Axiomat...   
6  Universal Uncertainty Princ

In [23]:
print(df['PDF_Text'].iloc[0])


arXiv:quant-ph/0302169v1  21 Feb 2003PROCEEDINGS OF THE TENTH INTERNATIONAL CONFERENCE ON
COMPUTATIONAL AND APPLIED MATHEMATICS
July 22 – 26, 2002, Leuven, Belgium pp.1–10
NONLINEAR DYNAMICS IN QUANTUM PHYSICS –
QUANTUM CHAOS AND QUANTUM INSTANTONS
Helmut Kr ¨oger
D´ epartement de Physique
Universit´ e Laval
Qu´ ebec, Qu´ ebec G1K 7P4, Canada
Abstract. We discuss the recently proposed quantum action - its interp retation, its moti-
vation, its mathematical properties and its use in physics: quantum mechanical tunneling,
quantum instantons and quantum chaos.
1.Introduction. Modern physics returns to some of its origins dating back
to the ﬁrst part of the last century. Examples are entanglement, according to
Schr¨ odinger themost peculiar property occuring inquantum mech anics, orthe
condensationofvery coldatomspredictedbyEinsteinandBose(Bo se-Einstein
condensate). Another example is nonlinear dynamics and chaos, da ting back
to the work of Poincar´ e and others and its modern descendent