# Sourcing Data ARXiv

Statistics

In [1]:
import urllib.request
import xml.etree.ElementTree as ET
import pandas as pd
import time

search_query = 'cat:stat*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Statistics'
cat_code = 'stat'
subject = 'Statistics'

def make_request_with_delay(url):
    try:
        response = urllib.request.urlopen(url, timeout=10)  # Set a timeout value (e.g., 10 seconds)
        data = response.read().decode('utf-8')
        response.close()
        return data
    except urllib.error.URLError as e:
        print(f"Error: {e}")
        return None

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]
        
        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None

        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)

    if data is None:
        # If there was an error or no data, you can handle it here (e.g., retry or skip)
        print("Skipping this request.")
        return

    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_stat = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_stat = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_stat['Authors'])
        for i in range(max_authors):
            df_stat[f'Author_{i+1}'] = df_stat['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_stat.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_stat.csv'
    df_stat.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_stat  # Return the DataFrame

if __name__ == "__main__":
    df_stat = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_stat.csv


Astrophysics

In [2]:
import urllib.request
import xml.etree.ElementTree as ET
import pandas as pd
import time

search_query = 'cat:astro-ph*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Astrophysics'
cat_code = 'astro-ph'
subject = 'Physics'

def make_request_with_delay(url):
    try:
        response = urllib.request.urlopen(url, timeout=10)  # Set a timeout value (e.g., 10 seconds)
        data = response.read().decode('utf-8')
        response.close()
        return data
    except urllib.error.URLError as e:
        print(f"Error: {e}")
        return None

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)

    if data is None:
        # If there was an error or no data, you can handle it here (e.g., retry or skip)
        print("Skipping this request.")
        return

    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_astroph = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_astroph= pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_astroph['Authors'])
        for i in range(max_authors):
            df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_astroph.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_astroph.csv'
    df_astroph.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_astroph  # Return the DataFrame

if __name__ == "__main__":
    df_astroph = main()  # Get the DataFrame from the main function

  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)
  df_astroph[f'Author_{i+1}'] = df_astroph['Authors'].apply(lambda x: x[i] if len(x) > i else None)


Data saved to arxiv_data_astroph.csv


Condensed Matter

In [3]:
import urllib.request
import xml.etree.ElementTree as ET
import pandas as pd
import time

search_query = 'cat:cond-mat*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Condensed Matter'
cat_code = 'cond-mat'
subject = 'Physics'

def make_request_with_delay(url):
    try:
        response = urllib.request.urlopen(url, timeout=10)  # Set a timeout value (e.g., 10 seconds)
        data = response.read().decode('utf-8')
        response.close()
        return data
    except urllib.error.URLError as e:
        print(f"Error: {e}")
        return None

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)

    if data is None:
        # If there was an error or no data, you can handle it here (e.g., retry or skip)
        print("Skipping this request.")
        return

    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_condmat = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_condmat = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_condmat['Authors'])
        for i in range(max_authors):
            df_condmat[f'Author_{i+1}'] = df_condmat['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_condmat.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_condmat.csv'
    df_condmat.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_condmat  # Return the DataFrame

if __name__ == "__main__":
    df_condmat = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_condmat.csv


General Relativity and Quantum Cosmology

In [4]:
search_query = 'cat:gr-qc*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'General Relativity and Quantum Cosmology'
cat_code = 'gr-qc'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_grqc = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_grqc = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_grqc['Authors'])
        for i in range(max_authors):
            df_grqc[f'Author_{i+1}'] = df_grqc['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_grqc.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_grqc.csv'
    df_grqc.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_grqc  # Return the DataFrame

if __name__ == "__main__":
    df_grqc = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_grqc.csv


High Energy Physics - Experiment


In [5]:
search_query = 'cat:hep-ex*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'High Energy Physics - Experiment'
cat_code = 'hep-ex'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]
        
        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None

        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_hepex = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_hepex = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_hepex['Authors'])
        for i in range(max_authors):
            df_hepex[f'Author_{i+1}'] = df_hepex['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_hepex.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_hepex.csv'
    df_hepex.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_hepex  # Return the DataFrame

if __name__ == "__main__":
    df_hepex = main()  # Get the DataFrame from the main function


Data saved to arxiv_data_hepex.csv


High Energy Physics - Lattice

In [6]:
search_query = 'cat:hep-lat*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'High Energy Physics - Lattice'
cat_code = 'hep-lat'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]
        
        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_heplat = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_heplat = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_heplat['Authors'])
        for i in range(max_authors):
            df_heplat[f'Author_{i+1}'] = df_heplat['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_heplat.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_heplat.csv'
    df_heplat.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_heplat  # Return the DataFrame

if __name__ == "__main__":
    df_heplat = main()  # Get the DataFrame from the main function


Data saved to arxiv_data_heplat.csv


High Energy Physics - Phenomenology

In [7]:
search_query = 'cat:hep-ph*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'High Energy Physics - Phenomenology'
cat_code = 'hep-ph'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_hepph = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_hepph = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_hepph['Authors'])
        for i in range(max_authors):
            df_hepph[f'Author_{i+1}'] = df_hepph['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_hepph.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_hepph.csv'
    df_hepph.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_hepph  # Return the DataFrame

if __name__ == "__main__":
    df_hepph = main()  # Get the DataFrame from the main function


Data saved to arxiv_data_hepph.csv


High Energy Physics - Theory

In [8]:
search_query = 'cat:hep-th*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'High Energy Physics - Theory'
cat_code = 'hep-th'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_hepth = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_hepth = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_hepth['Authors'])
        for i in range(max_authors):
            df_hepth[f'Author_{i+1}'] = df_hepth['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_hepth.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_hepth.csv'
    df_hepth.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_hepth  # Return the DataFrame

if __name__ == "__main__":
    df_hepth = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_hepth.csv


Mathematical Physics

In [9]:
search_query = 'cat:math-ph*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Mathematical Physics'
cat_code = 'math-ph'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_mathph = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_mathph = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_mathph['Authors'])
        for i in range(max_authors):
            df_mathph[f'Author_{i+1}'] = df_mathph['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_mathph.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_mathph.csv'
    df_mathph.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_mathph  # Return the DataFrame

if __name__ == "__main__":
    df_mathph = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_mathph.csv


Nonlinear Sciences

In [10]:
search_query = 'cat:nlin*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Nonlinear Sciences'
cat_code = 'nlin'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_nlin = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_nlin = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_nlin['Authors'])
        for i in range(max_authors):
            df_nlin[f'Author_{i+1}'] = df_nlin['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_nlin.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_nlin.csv'
    df_nlin.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_nlin  # Return the DataFrame

if __name__ == "__main__":
    df_nlin = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_nlin.csv


Nuclear Experiment

In [11]:
search_query = 'cat:nucl-ex*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Nuclear Experiment'
cat_code = 'nucl-ex'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_nuclex = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_nuclex = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_nuclex['Authors'])
        for i in range(max_authors):
            df_nuclex[f'Author_{i+1}'] = df_nuclex['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_nuclex.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_nuclex.csv'
    df_nuclex.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_nuclex  # Return the DataFrame

if __name__ == "__main__":
    df_nuclex = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_nuclex.csv


Nuclear Theory

In [12]:
search_query = 'cat:nucl-th*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Nuclear Theory'
cat_code = 'nucl-th'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_nuclth = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_nuclth = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_nuclth['Authors'])
        for i in range(max_authors):
            df_nuclth[f'Author_{i+1}'] = df_nuclth['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_nuclth.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_nuclth.csv'
    df_nuclth.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_nuclth  # Return the DataFrame

if __name__ == "__main__":
    df_nuclth = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_nuclth.csv


Physics

In [13]:
search_query = 'cat:physics*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Physics'
cat_code = 'physics'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_physics = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_physics = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_physics['Authors'])
        for i in range(max_authors):
            df_physics[f'Author_{i+1}'] = df_physics['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_physics.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_physics.csv'
    df_physics.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_physics  # Return the DataFrame

if __name__ == "__main__":
    df_physics = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_physics.csv


Quantum Physics

In [14]:
search_query = 'cat:quant-ph*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Quantum Physics'
cat_code = 'quant-ph'
subject = 'Physics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_quantph = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_quantph = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_quantph['Authors'])
        for i in range(max_authors):
            df_quantph[f'Author_{i+1}'] = df_quantph['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_quantph.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_quantph.csv'
    df_quantph.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_quantph  # Return the DataFrame

if __name__ == "__main__":
    df_quantph = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_quantph.csv


Mathematics

In [15]:
search_query = 'cat:math*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Mathematics'
cat_code = 'math'
subject = 'Mathematics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_math = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_math = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_math['Authors'])
        for i in range(max_authors):
            df_math[f'Author_{i+1}'] = df_math['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_math.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_math.csv'
    df_math.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_math  # Return the DataFrame

if __name__ == "__main__":
    df_math = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_math.csv


Computing Research Repository

In [16]:
search_query = 'cat:CoRR*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Computing Research Repository'
cat_code = 'CoRR'
subject = 'Computer Science'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
            
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_CoRR = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_CoRR = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_CoRR['Authors'])
        for i in range(max_authors):
            df_CoRR[f'Author_{i+1}'] = df_CoRR['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_CoRR.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_CoRR.csv'
    df_CoRR.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_CoRR  # Return the DataFrame

if __name__ == "__main__":
    df_CoRR = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_CoRR.csv


Quantitative Biology

In [17]:
search_query = 'cat:q-bio*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Quantitative Biology'
cat_code = 'q-bio'
subject = 'Quantitative Biology'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_qbio = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_qbio = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_qbio['Authors'])
        for i in range(max_authors):
            df_qbio[f'Author_{i+1}'] = df_qbio['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_qbio.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_qbio.csv'
    df_qbio.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_qbio  # Return the DataFrame

if __name__ == "__main__":
    df_qbio = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_qbio.csv


Quantitative Finance

In [18]:
search_query = 'cat:q-fin*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Quantitative Finance'
cat_code = 'q-fin'
subject = 'Quantitative Finance'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_qfin = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_qfin = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_qfin['Authors'])
        for i in range(max_authors):
            df_qfin[f'Author_{i+1}'] = df_qfin['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_qfin.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_qfin.csv'
    df_qfin.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_qfin  # Return the DataFrame

if __name__ == "__main__":
    df_qfin = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_qfin.csv


Electrical Engineering and Systems Science

In [19]:
search_query = 'cat:eess*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Electrical Engineering and Systems Science'
cat_code = 'eess'
subject = 'Electrical Engineering and Systems Science'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_eess = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_eess = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_eess['Authors'])
        for i in range(max_authors):
            df_eess[f'Author_{i+1}'] = df_eess['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_eess.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_eess.csv'
    df_eess.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_eess  # Return the DataFrame

if __name__ == "__main__":
    df_eess = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_eess.csv


Economics

In [20]:
search_query = 'cat:econ*'
start_index = 0
max_results = 1000  # Set the desired number of results

# Add the desired values for Category, Cat_code, and Subject
category = 'Economics'
cat_code = 'econ'
subject = 'Economics'

def make_request_with_delay(url):
    response = urllib.request.urlopen(url)
    data = response.read().decode('utf-8')
    response.close()
    return data

def parse_response(response):
    root = ET.fromstring(response)
    entries = root.findall('{http://www.w3.org/2005/Atom}entry')
    
    results = []
    for entry in entries:
        authors = entry.findall('{http://www.w3.org/2005/Atom}author/{http://www.w3.org/2005/Atom}name')
        author_names = [author.text for author in authors]
        title = entry.find('{http://www.w3.org/2005/Atom}title').text
        published_date = entry.find('{http://www.w3.org/2005/Atom}published').text
        updated_date = entry.find('{http://www.w3.org/2005/Atom}updated').text
        year = published_date[:4]

        # Extract the DOI if available
        doi_element = entry.find('{http://arxiv.org/schemas/atom}doi')
        doi = doi_element.text if doi_element is not None else None
        
        if '2018' <= year <= '2023':
            results.append({
                'DOI': doi,  # Include the DOI
                'Category': category,
                'Cat_code': cat_code,
                'Subject': subject,
                'Authors': author_names,
                'Title': title,
                'Published Date': published_date,
                'Updated Date': updated_date,
                'Year': year
            })
    
    return results

def main():
    url = f'http://export.arxiv.org/api/query?search_query={search_query}&start={start_index}&max_results={max_results}'
    data = make_request_with_delay(url)
    parsed_data = parse_response(data)

    if not parsed_data:
        # If no data, create an empty DataFrame with desired columns
        df_econ = pd.DataFrame(columns=['Category', 'Cat_code', 'Subject', 'Authors', 'Title', 'Published Date', 'Updated Date', 'Year'])
    else:
        # Create DataFrame
        df_econ = pd.DataFrame(parsed_data)

        # Split authors into separate columns
        max_authors = max(len(authors) for authors in df_econ['Authors'])
        for i in range(max_authors):
            df_econ[f'Author_{i+1}'] = df_econ['Authors'].apply(lambda x: x[i] if len(x) > i else None)

        # Drop the original 'Authors' column
        df_econ.drop('Authors', axis=1, inplace=True)

    # Save DataFrame to a CSV file
    csv_filename = 'arxiv_data_econ.csv'
    df_econ.to_csv(csv_filename, index=False)
    print(f'Data saved to {csv_filename}')

    # Introduce a delay of 3 seconds before making the next request
    time.sleep(3)

    return df_econ  # Return the DataFrame

if __name__ == "__main__":
    df_econ = main()  # Get the DataFrame from the main function

Data saved to arxiv_data_econ.csv


Consolidated DF

In [21]:
consolidated_df = pd.concat([df_astroph, df_condmat, df_grqc, df_hepex, df_heplat, df_hepph, df_hepth, df_mathph, df_nlin, df_nuclex, 
                             df_nuclth, df_physics, df_quantph, df_math, df_CoRR, df_qbio, df_qfin, df_stat, df_eess, df_econ], axis=0, join='outer', ignore_index=True)

# Display the consolidated DataFrame
print(consolidated_df)

# Save DataFrame to a CSV file
csv_filename = 'arxiv_data_consol.csv'
consolidated_df.to_csv(csv_filename, index=False)
print(f'Data saved to {csv_filename}')


                              DOI      Category  Cat_code    Subject  \
0                            None  Astrophysics  astro-ph    Physics   
1                            None  Astrophysics  astro-ph    Physics   
2                            None  Astrophysics  astro-ph    Physics   
3     10.1051/0004-6361/202243511  Astrophysics  astro-ph    Physics   
4     10.1051/0004-6361/202244242  Astrophysics  astro-ph    Physics   
...                           ...           ...       ...        ...   
5399                         None     Economics      econ  Economics   
5400                         None     Economics      econ  Economics   
5401               10.1086/718983     Economics      econ  Economics   
5402   10.1007/s00199-023-01485-1     Economics      econ  Economics   
5403                         None     Economics      econ  Economics   

                                                  Title        Published Date  \
0     (Un)conscious Bias in the Astronomical Profess..

In [22]:
consolidated_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5404 entries, 0 to 5403
Columns: 459 entries, DOI to Authors
dtypes: object(459)
memory usage: 18.9+ MB
