In [1]:
import requests
from bs4 import BeautifulSoup
from Bio import Entrez
import time
import pandas as pd
import numpy as np

In [2]:
# Placeholder container for paper details; it starts out empty and is
# rebound to each fetched batch by the download loop further down.
papers = list()

In [3]:
def search(query, retmax='250000'):
    """Run a relevance-sorted PubMed ESearch and return the parsed response.

    Args:
        query: Search expression forwarded to ESearch as ``term``.
        retmax: Maximum number of IDs to request. Defaults to the value this
            notebook has always used.

    Returns:
        Whatever ``Entrez.read`` produces for the ESearch response; the
        notebook reads its ``'IdList'`` entry.

    NOTE(review): NCBI's E-utilities documentation describes a 10,000-record
    ceiling for PubMed ESearch, so a larger ``retmax`` may not return more
    IDs. Confirm against the current NCBI docs.
    """
    Entrez.email = 'shamustappa@gmail.com'
    handle = Entrez.esearch(
        db='pubmed',
        sort='relevance',
        retmax=retmax,
        retmode='xml',
        term=query,
    )
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying HTTP connection even if parsing fails.
        handle.close()

In [4]:
# Query PubMed once, then keep just the list of matching IDs for the
# fetch step that follows.
studies = search(
    query='mechanism of self healing materials in construction'
)
studiesIdList = studies['IdList']

In [5]:
def fetch_details(id_list):
    """Fetch the full PubMed records for a batch of IDs.

    Args:
        id_list: Iterable of PubMed ID strings; they are comma-joined into a
            single EFetch ``id`` parameter.

    Returns:
        Whatever ``Entrez.read`` produces for the EFetch response; the
        notebook iterates its ``'PubmedArticle'`` entry. When ``id_list``
        yields no IDs, no request is made and ``{'PubmedArticle': []}`` is
        returned so callers can iterate the result unchanged.
    """
    ids = ','.join(id_list)
    if not ids:
        # Nothing to look up: skip the network round-trip entirely.
        return {'PubmedArticle': []}

    Entrez.email = 'shamustappa@gmail.com'
    handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
    try:
        return Entrez.read(handle)
    finally:
        # Release the underlying HTTP connection even if parsing fails.
        handle.close()

In [6]:
# Column-wise accumulators: one entry per fetched article, in fetch order.
title_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []

chunk_size = 10000  # number of IDs handed to fetch_details per request


def _get_nested(node, path, default):
    """Follow `path` (a sequence of keys/indexes) into nested containers.

    Returns `default` as soon as one step is missing instead of raising.
    """
    for step in path:
        try:
            node = node[step]
        except (KeyError, IndexError, TypeError):
            return default
    return node


def _extract_paper_fields(paper):
    """Return (title, abstract, journal, language, year, month) for one record.

    `paper` is one entry of the 'PubmedArticle' list returned by
    fetch_details. Optional fields fall back to the same placeholder strings
    the CSV has always used ('No Abstract' / 'No Data').
    """
    article = paper['MedlineCitation']['Article']

    title = article['ArticleTitle']

    # An abstract can be split into several AbstractText sections; keep all
    # of them rather than only the first so the text is not truncated.
    sections = _get_nested(article, ('Abstract', 'AbstractText'), [])
    if isinstance(sections, str):
        sections = [sections]
    if sections:
        abstract = ' '.join(str(section) for section in sections)
    else:
        abstract = 'No Abstract'

    journal = _get_nested(article, ('Journal', 'Title'), 'No Data')
    language = _get_nested(article, ('Language', 0), 'No Data')

    # NOTE(review): only the 'Year' / 'Month' keys are consulted; records that
    # express the date some other way end up as 'No Data' -- confirm whether a
    # fallback is wanted.
    pubdate_path = ('Journal', 'JournalIssue', 'PubDate')
    year = _get_nested(article, pubdate_path + ('Year',), 'No Data')
    month = _get_nested(article, pubdate_path + ('Month',), 'No Data')

    return title, abstract, journal, language, year, month


# Walk the ID list in fixed-size slices so each fetch request stays bounded.
for chunk_i in range(0, len(studiesIdList), chunk_size):
    chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
    papers = fetch_details(chunk)

    for paper in papers['PubmedArticle']:
        title, abstract, journal, language, year, month = _extract_paper_fields(paper)
        title_list.append(title)
        abstract_list.append(abstract)
        journal_list.append(journal)
        language_list.append(language)
        pubdate_year_list.append(year)
        pubdate_month_list.append(month)

# Assemble the accumulated columns into a single table.
df = pd.DataFrame({
    'Title': title_list,
    'Abstract': abstract_list,
    'Journal': journal_list,
    'Language': language_list,
    'Year': pubdate_year_list,
    'Month': pubdate_month_list,
})

# Persist the table next to the notebook.
df.to_csv('mechanisms_pubmed.csv', index=False)