In [None]:
!pip install requests beautifulsoup4 pandas





In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

def scrape_pubmed(query, max_results=10):
    base_url = "https://pubmed.ncbi.nlm.nih.gov/"
    search_url = f"{base_url}?term={query.replace(' ', '+')}"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(search_url, headers=headers)
    if response.status_code != 200:
        print("Failed to retrieve data")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    articles = soup.find_all("article", class_="full-docsum", limit=max_results)

    data = []

    for article in articles:
        title = article.find("a", class_="docsum-title")
        title_text = title.text.strip() if title else "N/A"
        link = base_url + title["href"] if title else "N/A"

        authors = article.find("span", class_="docsum-authors full-authors")
        authors_text = authors.text.strip() if authors else "N/A"

        journal = article.find("span", class_="docsum-journal-citation full-journal-citation")
        journal_text = journal.text.strip() if journal else "N/A"

        date = article.find("span", class_="docsum-pubdate")
        date_text = date.text.strip() if date else "N/A"

        data.append({
            "Title": title_text,
            "Authors": authors_text,
            "Journal": journal_text,
            "Publication Date": date_text,
            "Link": link
        })

        # Avoid hitting PubMed too fast
        time.sleep(1)

    return data


In [None]:
# Example usage
query = "COVID-19 vaccines"
results = scrape_pubmed(query, max_results=5)



In [None]:
# Convert results to a DataFrame and display
import pandas as pd

# Convert results to a DataFrame
df = pd.DataFrame(results)

# Display DataFrame in Jupyter Notebook
from IPython.display import display
display(df)


Unnamed: 0,Title,Authors,Journal,Publication Date,Link
0,Covid-19 vaccines and variants of concern: A r...,Hadj Hassine I.,Rev Med Virol. 2022 Jul;32(4):e2313. doi: 10.1...,,https://pubmed.ncbi.nlm.nih.gov//34755408/
1,Tracking the COVID-19 vaccines: The global lan...,"Yadav T, Kumar S, Mishra G, Saxena SK.",Hum Vaccin Immunother. 2023 Dec 31;19(1):21915...,,https://pubmed.ncbi.nlm.nih.gov//36995773/
2,Safety and efficacy of COVID-19 vaccines: A sy...,"Beladiya J, Kumar A, Vasava Y, Parmar K, Patel...",Rev Med Virol. 2024 Jan;34(1):e2507. doi: 10.1...,,https://pubmed.ncbi.nlm.nih.gov//38282394/
3,Comparing COVID-19 vaccines for their characte...,"Fiolet T, Kherabi Y, MacDonald CJ, Ghosn J, Pe...",Clin Microbiol Infect. 2022 Feb;28(2):202-221....,,https://pubmed.ncbi.nlm.nih.gov//34715347/
4,Role of COVID-19 Vaccines in SARS-CoV-2 Variants.,"Zhou Z, Zhu Y, Chu M.",Front Immunol. 2022 May 20;13:898192. doi: 10....,,https://pubmed.ncbi.nlm.nih.gov//35669787/


In [None]:
df.to_csv("pubmed_articles.csv", index=False)
print("Saved as pubmed_articles.csv")


Saved as pubmed_articles.csv


In [28]:
from google.colab import files
files.download("pubmed_articles.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=c030d6537fc3252a19cc00216501a5f396645b07a1838fb33c20b90876d30271
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [36]:
from fpdf import FPDF
pdf_file_path = "pubmed_articles.pdf"
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_font("Arial", style='', size=12)

In [38]:
    # Save the PDF
    pdf.output(pdf_file_path)
    print(f"Saved as {pdf_file_path}")

Saved as pubmed_articles.pdf


In [43]:
# Define file paths
csv_file_path = "pubmed_articles.csv"

# Auto-download CSV and PDF in Google Colab
from google.colab import files
files.download(csv_file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

PDF

In [42]:
from fpdf import FPDF
import pandas as pd
from google.colab import files

# Load the CSV file
csv_file_path = "pubmed_articles.csv"
df = pd.read_csv(csv_file_path)

# Create a PDF document
pdf = FPDF()
pdf.set_auto_page_break(auto=True, margin=15)
pdf.add_page()
pdf.set_font("Arial", style='', size=12)

# Add title
pdf.set_font("Arial", style='B', size=16)
pdf.cell(200, 10, "PubMed Articles Summary", ln=True, align='C')
pdf.ln(10)

# Add table headers
pdf.set_font("Arial", style='B', size=12)
pdf.cell(90, 10, "Title", border=1)
pdf.cell(90, 10, "Authors", border=1)
pdf.ln()

# Add article data
pdf.set_font("Arial", size=10)
for index, row in df.iterrows():
    pdf.cell(90, 10, row["Title"][:40] + "...", border=1)  # Truncate long titles
    pdf.cell(90, 10, row["Authors"][:40] + "...", border=1)  # Truncate long author lists
    pdf.ln()

# Save the PDF
pdf_file_path = "pubmed_articles.pdf"
pdf.output(pdf_file_path)

print(f"Saved as {pdf_file_path}")

# Auto-download the PDF
files.download(pdf_file_path)


Saved as pubmed_articles.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>